Home | History | Annotate | Line # | Download | only in aarch64
simulator.c revision 1.1.1.4
      1 /* simulator.c -- Interface for the AArch64 simulator.
      2 
      3    Copyright (C) 2015-2020 Free Software Foundation, Inc.
      4 
      5    Contributed by Red Hat.
      6 
      7    This file is part of GDB.
      8 
      9    This program is free software; you can redistribute it and/or modify
     10    it under the terms of the GNU General Public License as published by
     11    the Free Software Foundation; either version 3 of the License, or
     12    (at your option) any later version.
     13 
     14    This program is distributed in the hope that it will be useful,
     15    but WITHOUT ANY WARRANTY; without even the implied warranty of
     16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17    GNU General Public License for more details.
     18 
     19    You should have received a copy of the GNU General Public License
     20    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
     21 
     22 #include "config.h"
     23 #include <stdlib.h>
     24 #include <stdio.h>
     25 #include <string.h>
     26 #include <sys/types.h>
     27 #include <math.h>
     28 #include <time.h>
     29 #include <limits.h>
     30 
     31 #include "simulator.h"
     32 #include "cpustate.h"
     33 #include "memory.h"
     34 
/* Values passed as the final argument of the aarch64_get_reg_* and
   aarch64_set_reg_* calls in this file.  SP_OK is used for base
   registers, which may name the stack pointer; NO_SP is used for
   registers that may not.  */
#define NO_SP 0
#define SP_OK 1

/* CPSR flag helpers.  They expand to code that references a variable
   named `cpu', which must be in scope wherever they are used.  TST
   hands _flag to aarch64_test_CPSR_bit; IS_SET and IS_CLEAR normalise
   that result to exactly 1 or 0.  */
#define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
#define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)

/* Space saver macro: apply uimm to bits HIGH..LOW of the current
   instruction word (as returned by aarch64_get_instr).  Like the flag
   helpers above it relies on `cpu' being in scope.  */
#define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))
     44 
/* Report an unallocated instruction encoding: emit the disassembly and
   an instruction trace naming the simulator source line and the
   current PC, then stop the engine with sim_stopped / SIM_SIGILL.
   Requires `cpu' to be in scope.  */
#define HALT_UNALLOC							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unallocated instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGILL);			\
    }									\
  while (0)

/* Report an instruction that the simulator does not implement yet.
   Traces as HALT_UNALLOC does; in addition, when TRACE_ANY_P reports
   that no tracing is active, the raw instruction word is printed via
   sim_io_eprintf so the failure is not silent.  The engine is then
   stopped with sim_stopped / SIM_SIGABRT.
   NOTE(review): if sim_io_eprintf follows printf semantics, "%#08x"
   counts the "0x" prefix in the field width, so only six hex digits
   are zero padded -- "%#010x" may have been intended; confirm.  */
#define HALT_NYI							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unimplemented instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      if (! TRACE_ANY_P (cpu))						\
        sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
                        aarch64_get_instr (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGABRT);			\
    }									\
  while (0)

/* Invoke HALT_NYI unless bits HI..LO of the current instruction equal
   EXPECTED.  Used to guard decoders that only handle one value of a
   field.  */
#define NYI_assert(HI, LO, EXPECTED)					\
  do									\
    {									\
      if (INSTR ((HI), (LO)) != (EXPECTED))				\
	HALT_NYI;							\
    }									\
  while (0)
     81 
     82 /* Helper functions used by expandLogicalImmediate.  */
     83 
/* Return a 64-bit value whose low N bits are set and whose remaining
   bits are clear, i.e. for i = 1 ... N result<i-1> = 1.

   N is expected to lie in the range 0..64.  N == 64 is handled
   separately because shifting a 64-bit value by its full width is
   undefined.  All arithmetic is done in uint64_t so that the result
   does not depend on the width of `unsigned long' on the host.  */
static inline uint64_t
ones (int N)
{
  return N == 64 ? ~(uint64_t) 0 : (((uint64_t) 1 << N) - 1);
}
     90 
/* Extract the single bit N of VAL by asking pickbits64 for the
   one-bit field N..N; the bit is delivered in result<0>.  */
static inline uint64_t
pickbit (uint64_t val, int N)
{
  const int bit_index = N;

  return pickbits64 (val, bit_index, bit_index);
}
     97 
/* Expand the (N, immr, imms) fields of a logical-immediate instruction
   into the 64-bit mask they encode.

   S is the imms field, R the immr field and N the N bit.  The pattern
   is S+1 consecutive one bits inside an element of 2, 4, 8, 16, 32 or
   64 bits, rotated right by R within the element and then replicated
   across all 64 bits.  Encodings that do not describe a valid
   immediate yield 0.  */
static uint64_t
expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
{
  unsigned element_bits;
  unsigned width;
  uint64_t element_mask;
  uint64_t pattern;

  if (N != 0)
    {
      /* N set selects a single 64-bit element.  */
      element_bits = 64;
      element_mask = 0xffffffffffffffffull;
    }
  else
    {
      /* The run of leading ones in imms selects the element size.  */
      if (S <= 0x1f)		/* 0xxxxx */
	element_bits = 32;
      else if (S <= 0x2f)	/* 10xxxx */
	element_bits = 16;
      else if (S <= 0x37)	/* 110xxx */
	element_bits = 8;
      else if (S <= 0x3b)	/* 1110xx */
	element_bits = 4;
      else if (S <= 0x3d)	/* 11110x */
	element_bits = 2;
      else
	return 0;

      /* Only the low bits of S and R are significant for the chosen
	 element size; the top bits are ignored.  */
      S &= element_bits - 1;
      R &= element_bits - 1;
      element_mask = (1ull << element_bits) - 1;
    }

  /* An element made entirely of ones is not a valid encoding.  This
     also guarantees that S + 1 below is never the full 64 bits.  */
  if (S == element_bits - 1)
    return 0;

  /* S+1 consecutive low bits set.  */
  pattern = (1ull << (S + 1)) - 1;

  /* Rotate right by R inside the element (equivalently, rotate left
     by element_bits - R).  */
  if (R != 0)
    pattern = ((pattern << (element_bits - R)) & element_mask)
	      | (pattern >> R);

  /* Replicate the element until it fills 64 bits, doubling the
     populated width on every step.  */
  for (width = element_bits; width < 64; width *= 2)
    pattern |= pattern << width;

  return pattern;
}
    154 
/* Instr[22,10] encodes N, immr and imms.  We want a lookup table with
   one entry for each possible combination, i.e. 13 bits worth of
   entries.  The table is indexed by that 13-bit field (N in bit 12,
   immr in bits 11..6, imms in bits 5..0) and is filled in by
   aarch64_init_LIT_table; entries for invalid encodings hold 0.  */
#define  LI_TABLE_SIZE  (1 << 13)
static uint64_t LITable[LI_TABLE_SIZE];
    159 
    160 void
    161 aarch64_init_LIT_table (void)
    162 {
    163   unsigned index;
    164 
    165   for (index = 0; index < LI_TABLE_SIZE; index++)
    166     {
    167       uint32_t N    = uimm (index, 12, 12);
    168       uint32_t immr = uimm (index, 11, 6);
    169       uint32_t imms = uimm (index, 5, 0);
    170 
    171       LITable [index] = expand_logical_immediate (imms, immr, N);
    172     }
    173 }
    174 
    175 static void
    176 dexNotify (sim_cpu *cpu)
    177 {
    178   /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
    179                            2 ==> exit Java, 3 ==> start next bytecode.  */
    180   uint32_t type = INSTR (14, 0);
    181 
    182   TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);
    183 
    184   switch (type)
    185     {
    186     case 0:
    187       /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
    188 	 aarch64_get_reg_u64 (cpu, R22, 0));  */
    189       break;
    190     case 1:
    191       /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
    192 	 aarch64_get_reg_u64 (cpu, R22, 0));  */
    193       break;
    194     case 2:
    195       /* aarch64_notifyMethodExit ();  */
    196       break;
    197     case 3:
    198       /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
    199 	 aarch64_get_reg_u64 (cpu, R22, 0));  */
    200       break;
    201     }
    202 }
    203 
    204 /* secondary decode within top level groups  */
    205 
    206 static void
    207 dexPseudo (sim_cpu *cpu)
    208 {
    209   /* assert instr[28,27] = 00
    210 
    211      We provide 2 pseudo instructions:
    212 
    213      HALT stops execution of the simulator causing an immediate
    214      return to the x86 code which entered it.
    215 
    216      CALLOUT initiates recursive entry into x86 code.  A register
    217      argument holds the address of the x86 routine.  Immediate
    218      values in the instruction identify the number of general
    219      purpose and floating point register arguments to be passed
    220      and the type of any value to be returned.  */
    221 
    222   uint32_t PSEUDO_HALT      =  0xE0000000U;
    223   uint32_t PSEUDO_CALLOUT   =  0x00018000U;
    224   uint32_t PSEUDO_CALLOUTR  =  0x00018001U;
    225   uint32_t PSEUDO_NOTIFY    =  0x00014000U;
    226   uint32_t dispatch;
    227 
    228   if (aarch64_get_instr (cpu) == PSEUDO_HALT)
    229     {
    230       TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
    231       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
    232 		       sim_stopped, SIM_SIGTRAP);
    233     }
    234 
    235   dispatch = INSTR (31, 15);
    236 
    237   /* We do not handle callouts at the moment.  */
    238   if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
    239     {
    240       TRACE_EVENTS (cpu, " Callout");
    241       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
    242 		       sim_stopped, SIM_SIGABRT);
    243     }
    244 
    245   else if (dispatch == PSEUDO_NOTIFY)
    246     dexNotify (cpu);
    247 
    248   else
    249     HALT_UNALLOC;
    250 }
    251 
    252 /* Load-store single register (unscaled offset)
    253    These instructions employ a base register plus an unscaled signed
    254    9 bit offset.
    255 
    256    N.B. the base register (source) can be Xn or SP. all other
    257    registers may not be SP.  */
    258 
    259 /* 32 bit load 32 bit unscaled signed 9 bit.  */
    260 static void
    261 ldur32 (sim_cpu *cpu, int32_t offset)
    262 {
    263   unsigned rn = INSTR (9, 5);
    264   unsigned rt = INSTR (4, 0);
    265 
    266   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    267   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
    268 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    269 			+ offset));
    270 }
    271 
    272 /* 64 bit load 64 bit unscaled signed 9 bit.  */
    273 static void
    274 ldur64 (sim_cpu *cpu, int32_t offset)
    275 {
    276   unsigned rn = INSTR (9, 5);
    277   unsigned rt = INSTR (4, 0);
    278 
    279   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    280   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
    281 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    282 			+ offset));
    283 }
    284 
    285 /* 32 bit load zero-extended byte unscaled signed 9 bit.  */
    286 static void
    287 ldurb32 (sim_cpu *cpu, int32_t offset)
    288 {
    289   unsigned rn = INSTR (9, 5);
    290   unsigned rt = INSTR (4, 0);
    291 
    292   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    293   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
    294 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    295 			+ offset));
    296 }
    297 
    298 /* 32 bit load sign-extended byte unscaled signed 9 bit.  */
    299 static void
    300 ldursb32 (sim_cpu *cpu, int32_t offset)
    301 {
    302   unsigned rn = INSTR (9, 5);
    303   unsigned rt = INSTR (4, 0);
    304 
    305   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    306   aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
    307 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    308 			+ offset));
    309 }
    310 
    311 /* 64 bit load sign-extended byte unscaled signed 9 bit.  */
    312 static void
    313 ldursb64 (sim_cpu *cpu, int32_t offset)
    314 {
    315   unsigned rn = INSTR (9, 5);
    316   unsigned rt = INSTR (4, 0);
    317 
    318   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    319   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
    320 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    321 			+ offset));
    322 }
    323 
    324 /* 32 bit load zero-extended short unscaled signed 9 bit  */
    325 static void
    326 ldurh32 (sim_cpu *cpu, int32_t offset)
    327 {
    328   unsigned rn = INSTR (9, 5);
    329   unsigned rd = INSTR (4, 0);
    330 
    331   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    332   aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
    333 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    334 			+ offset));
    335 }
    336 
    337 /* 32 bit load sign-extended short unscaled signed 9 bit  */
    338 static void
    339 ldursh32 (sim_cpu *cpu, int32_t offset)
    340 {
    341   unsigned rn = INSTR (9, 5);
    342   unsigned rd = INSTR (4, 0);
    343 
    344   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    345   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
    346 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    347 			+ offset));
    348 }
    349 
    350 /* 64 bit load sign-extended short unscaled signed 9 bit  */
    351 static void
    352 ldursh64 (sim_cpu *cpu, int32_t offset)
    353 {
    354   unsigned rn = INSTR (9, 5);
    355   unsigned rt = INSTR (4, 0);
    356 
    357   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    358   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
    359 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    360 			+ offset));
    361 }
    362 
    363 /* 64 bit load sign-extended word unscaled signed 9 bit  */
    364 static void
    365 ldursw (sim_cpu *cpu, int32_t offset)
    366 {
    367   unsigned rn = INSTR (9, 5);
    368   unsigned rd = INSTR (4, 0);
    369 
    370   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    371   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s32
    372 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    373 			+ offset));
    374 }
    375 
    376 /* N.B. with stores the value in source is written to the address
    377    identified by source2 modified by offset.  */
    378 
    379 /* 32 bit store 32 bit unscaled signed 9 bit.  */
    380 static void
    381 stur32 (sim_cpu *cpu, int32_t offset)
    382 {
    383   unsigned rn = INSTR (9, 5);
    384   unsigned rd = INSTR (4, 0);
    385 
    386   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    387   aarch64_set_mem_u32 (cpu,
    388 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
    389 		       aarch64_get_reg_u32 (cpu, rd, NO_SP));
    390 }
    391 
    392 /* 64 bit store 64 bit unscaled signed 9 bit  */
    393 static void
    394 stur64 (sim_cpu *cpu, int32_t offset)
    395 {
    396   unsigned rn = INSTR (9, 5);
    397   unsigned rd = INSTR (4, 0);
    398 
    399   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    400   aarch64_set_mem_u64 (cpu,
    401 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
    402 		       aarch64_get_reg_u64 (cpu, rd, NO_SP));
    403 }
    404 
    405 /* 32 bit store byte unscaled signed 9 bit  */
    406 static void
    407 sturb (sim_cpu *cpu, int32_t offset)
    408 {
    409   unsigned rn = INSTR (9, 5);
    410   unsigned rd = INSTR (4, 0);
    411 
    412   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    413   aarch64_set_mem_u8 (cpu,
    414 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
    415 		      aarch64_get_reg_u8 (cpu, rd, NO_SP));
    416 }
    417 
    418 /* 32 bit store short unscaled signed 9 bit  */
    419 static void
    420 sturh (sim_cpu *cpu, int32_t offset)
    421 {
    422   unsigned rn = INSTR (9, 5);
    423   unsigned rd = INSTR (4, 0);
    424 
    425   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    426   aarch64_set_mem_u16 (cpu,
    427 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
    428 		       aarch64_get_reg_u16 (cpu, rd, NO_SP));
    429 }
    430 
    431 /* Load single register pc-relative label
    432    Offset is a signed 19 bit immediate count in words
    433    rt may not be SP.  */
    434 
    435 /* 32 bit pc-relative load  */
    436 static void
    437 ldr32_pcrel (sim_cpu *cpu, int32_t offset)
    438 {
    439   unsigned rd = INSTR (4, 0);
    440 
    441   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    442   aarch64_set_reg_u64 (cpu, rd, NO_SP,
    443 		       aarch64_get_mem_u32
    444 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
    445 }
    446 
    447 /* 64 bit pc-relative load  */
    448 static void
    449 ldr_pcrel (sim_cpu *cpu, int32_t offset)
    450 {
    451   unsigned rd = INSTR (4, 0);
    452 
    453   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    454   aarch64_set_reg_u64 (cpu, rd, NO_SP,
    455 		       aarch64_get_mem_u64
    456 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
    457 }
    458 
    459 /* sign extended 32 bit pc-relative load  */
    460 static void
    461 ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
    462 {
    463   unsigned rd = INSTR (4, 0);
    464 
    465   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    466   aarch64_set_reg_u64 (cpu, rd, NO_SP,
    467 		       aarch64_get_mem_s32
    468 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
    469 }
    470 
    471 /* float pc-relative load  */
    472 static void
    473 fldrs_pcrel (sim_cpu *cpu, int32_t offset)
    474 {
    475   unsigned int rd = INSTR (4, 0);
    476 
    477   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    478   aarch64_set_vec_u32 (cpu, rd, 0,
    479 		       aarch64_get_mem_u32
    480 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
    481 }
    482 
    483 /* double pc-relative load  */
    484 static void
    485 fldrd_pcrel (sim_cpu *cpu, int32_t offset)
    486 {
    487   unsigned int st = INSTR (4, 0);
    488 
    489   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    490   aarch64_set_vec_u64 (cpu, st, 0,
    491 		       aarch64_get_mem_u64
    492 		       (cpu, aarch64_get_PC (cpu) + offset * 4));
    493 }
    494 
    495 /* long double pc-relative load.  */
    496 static void
    497 fldrq_pcrel (sim_cpu *cpu, int32_t offset)
    498 {
    499   unsigned int st = INSTR (4, 0);
    500   uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
    501   FRegister a;
    502 
    503   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    504   aarch64_get_mem_long_double (cpu, addr, & a);
    505   aarch64_set_FP_long_double (cpu, st, a);
    506 }
    507 
/* This can be used to scale an offset by applying the requisite
   shift.  The second argument is pasted onto `ScaleShift' to name the
   shift constant, so it must be an element size for which such a
   constant exists; this file uses 16, 32, 64 and 128.  */

#define SCALE(_offset, _elementSize) \
    ((_offset) << ScaleShift ## _elementSize)

/* This can be used to optionally scale a register derived offset
   by applying the requisite shift as indicated by the Scaling
   argument.  The second argument is an element size suffix as for
   SCALE (callers in this file pass 32, 64 and 128).  The third
   argument is either Scaled or Unscaled.
   N.B. when _Scaling evaluates as true the offset is shifted left by
   the matching ScaleShift constant; otherwise the shift count is 0
   and the offset is returned unchanged.  */

#define OPT_SCALE(_offset, _elementType, _Scaling) \
  ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
    524 
    525 /* This can be used to zero or sign extend a 32 bit register derived
    526    value to a 64 bit value.  the first argument must be the value as
    527    a uint32_t and the second must be either UXTW or SXTW. The result
    528    is returned as an int64_t.  */
    529 
    530 static inline int64_t
    531 extend (uint32_t value, Extension extension)
    532 {
    533   union
    534   {
    535     uint32_t u;
    536     int32_t   n;
    537   } x;
    538 
    539   /* A branchless variant of this ought to be possible.  */
    540   if (extension == UXTW || extension == NoExtension)
    541     return value;
    542 
    543   x.u = value;
    544   return x.n;
    545 }
    546 
    547 /* Scalar Floating Point
    548 
    549    FP load/store single register (4 addressing modes)
    550 
    551    N.B. the base register (source) can be the stack pointer.
    552    The secondary source register (source2) can only be an Xn register.  */
    553 
    554 /* Load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
    555 static void
    556 fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    557 {
    558   unsigned rn = INSTR (9, 5);
    559   unsigned st = INSTR (4, 0);
    560   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    561 
    562   if (wb != Post)
    563     address += offset;
    564 
    565   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    566   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
    567   if (wb == Post)
    568     address += offset;
    569 
    570   if (wb != NoWriteBack)
    571     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
    572 }
    573 
    574 /* Load 8 bit with unsigned 12 bit offset.  */
    575 static void
    576 fldrb_abs (sim_cpu *cpu, uint32_t offset)
    577 {
    578   unsigned rd = INSTR (4, 0);
    579   unsigned rn = INSTR (9, 5);
    580   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
    581 
    582   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    583   aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
    584 }
    585 
    586 /* Load 16 bit scaled unsigned 12 bit.  */
    587 static void
    588 fldrh_abs (sim_cpu *cpu, uint32_t offset)
    589 {
    590   unsigned rd = INSTR (4, 0);
    591   unsigned rn = INSTR (9, 5);
    592   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);
    593 
    594   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    595   aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
    596 }
    597 
    598 /* Load 32 bit scaled unsigned 12 bit.  */
    599 static void
    600 fldrs_abs (sim_cpu *cpu, uint32_t offset)
    601 {
    602   unsigned rd = INSTR (4, 0);
    603   unsigned rn = INSTR (9, 5);
    604   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);
    605 
    606   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    607   aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
    608 }
    609 
    610 /* Load 64 bit scaled unsigned 12 bit.  */
    611 static void
    612 fldrd_abs (sim_cpu *cpu, uint32_t offset)
    613 {
    614   unsigned rd = INSTR (4, 0);
    615   unsigned rn = INSTR (9, 5);
    616   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);
    617 
    618   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    619   aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
    620 }
    621 
    622 /* Load 128 bit scaled unsigned 12 bit.  */
    623 static void
    624 fldrq_abs (sim_cpu *cpu, uint32_t offset)
    625 {
    626   unsigned rd = INSTR (4, 0);
    627   unsigned rn = INSTR (9, 5);
    628   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
    629 
    630   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    631   aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
    632   aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
    633 }
    634 
    635 /* Load 32 bit scaled or unscaled zero- or sign-extended
    636    32-bit register offset.  */
    637 static void
    638 fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
    639 {
    640   unsigned rm = INSTR (20, 16);
    641   unsigned rn = INSTR (9, 5);
    642   unsigned st = INSTR (4, 0);
    643   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    644   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
    645   uint64_t displacement = OPT_SCALE (extended, 32, scaling);
    646 
    647   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    648   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
    649 		       (cpu, address + displacement));
    650 }
    651 
    652 /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
    653 static void
    654 fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    655 {
    656   unsigned rn = INSTR (9, 5);
    657   unsigned st = INSTR (4, 0);
    658   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    659 
    660   if (wb != Post)
    661     address += offset;
    662 
    663   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    664   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));
    665 
    666   if (wb == Post)
    667     address += offset;
    668 
    669   if (wb != NoWriteBack)
    670     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
    671 }
    672 
    673 /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset.  */
    674 static void
    675 fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
    676 {
    677   unsigned rm = INSTR (20, 16);
    678   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
    679   uint64_t displacement = OPT_SCALE (extended, 64, scaling);
    680 
    681   fldrd_wb (cpu, displacement, NoWriteBack);
    682 }
    683 
    684 /* Load 128 bit unscaled signed 9 bit with pre- or post-writeback.  */
    685 static void
    686 fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    687 {
    688   FRegister a;
    689   unsigned rn = INSTR (9, 5);
    690   unsigned st = INSTR (4, 0);
    691   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    692 
    693   if (wb != Post)
    694     address += offset;
    695 
    696   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    697   aarch64_get_mem_long_double (cpu, address, & a);
    698   aarch64_set_FP_long_double (cpu, st, a);
    699 
    700   if (wb == Post)
    701     address += offset;
    702 
    703   if (wb != NoWriteBack)
    704     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
    705 }
    706 
    707 /* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset  */
    708 static void
    709 fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
    710 {
    711   unsigned rm = INSTR (20, 16);
    712   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
    713   uint64_t displacement = OPT_SCALE (extended, 128, scaling);
    714 
    715   fldrq_wb (cpu, displacement, NoWriteBack);
    716 }
    717 
    718 /* Memory Access
    719 
    720    load-store single register
    721    There are four addressing modes available here which all employ a
    722    64 bit source (base) register.
    723 
    724    N.B. the base register (source) can be the stack pointer.
    725    The secondary source register (source2)can only be an Xn register.
    726 
    727    Scaled, 12-bit, unsigned immediate offset, without pre- and
    728    post-index options.
    729    Unscaled, 9-bit, signed immediate offset with pre- or post-index
    730    writeback.
    731    scaled or unscaled 64-bit register offset.
    732    scaled or unscaled 32-bit extended register offset.
    733 
    734    All offsets are assumed to be raw from the decode i.e. the
    735    simulator is expected to adjust scaled offsets based on the
    736    accessed data size with register or extended register offset
    737    versions the same applies except that in the latter case the
    738    operation may also require a sign extend.
    739 
    740    A separate method is provided for each possible addressing mode.  */
    741 
    742 /* 32 bit load 32 bit scaled unsigned 12 bit  */
    743 static void
    744 ldr32_abs (sim_cpu *cpu, uint32_t offset)
    745 {
    746   unsigned rn = INSTR (9, 5);
    747   unsigned rt = INSTR (4, 0);
    748 
    749   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    750   /* The target register may not be SP but the source may be.  */
    751   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
    752 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    753 			+ SCALE (offset, 32)));
    754 }
    755 
    756 /* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
    757 static void
    758 ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    759 {
    760   unsigned rn = INSTR (9, 5);
    761   unsigned rt = INSTR (4, 0);
    762   uint64_t address;
    763 
    764   if (rn == rt && wb != NoWriteBack)
    765     HALT_UNALLOC;
    766 
    767   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    768 
    769   if (wb != Post)
    770     address += offset;
    771 
    772   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    773   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
    774 
    775   if (wb == Post)
    776     address += offset;
    777 
    778   if (wb != NoWriteBack)
    779     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
    780 }
    781 
    782 /* 32 bit load 32 bit scaled or unscaled
    783    zero- or sign-extended 32-bit register offset  */
    784 static void
    785 ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
    786 {
    787   unsigned rm = INSTR (20, 16);
    788   unsigned rn = INSTR (9, 5);
    789   unsigned rt = INSTR (4, 0);
    790   /* rn may reference SP, rm and rt must reference ZR  */
    791 
    792   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    793   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
    794   uint64_t displacement =  OPT_SCALE (extended, 32, scaling);
    795 
    796   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    797   aarch64_set_reg_u64 (cpu, rt, NO_SP,
    798 		       aarch64_get_mem_u32 (cpu, address + displacement));
    799 }
    800 
    801 /* 64 bit load 64 bit scaled unsigned 12 bit  */
    802 static void
    803 ldr_abs (sim_cpu *cpu, uint32_t offset)
    804 {
    805   unsigned rn = INSTR (9, 5);
    806   unsigned rt = INSTR (4, 0);
    807 
    808   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    809   /* The target register may not be SP but the source may be.  */
    810   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
    811 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    812 			+ SCALE (offset, 64)));
    813 }
    814 
    815 /* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
    816 static void
    817 ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    818 {
    819   unsigned rn = INSTR (9, 5);
    820   unsigned rt = INSTR (4, 0);
    821   uint64_t address;
    822 
    823   if (rn == rt && wb != NoWriteBack)
    824     HALT_UNALLOC;
    825 
    826   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    827 
    828   if (wb != Post)
    829     address += offset;
    830 
    831   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    832   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
    833 
    834   if (wb == Post)
    835     address += offset;
    836 
    837   if (wb != NoWriteBack)
    838     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
    839 }
    840 
    841 /* 64 bit load 64 bit scaled or unscaled zero-
    842    or sign-extended 32-bit register offset.  */
    843 static void
    844 ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
    845 {
    846   unsigned rm = INSTR (20, 16);
    847   unsigned rn = INSTR (9, 5);
    848   unsigned rt = INSTR (4, 0);
    849   /* rn may reference SP, rm and rt must reference ZR  */
    850 
    851   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    852   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
    853   uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
    854 
    855   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    856   aarch64_set_reg_u64 (cpu, rt, NO_SP,
    857 		       aarch64_get_mem_u64 (cpu, address + displacement));
    858 }
    859 
    860 /* 32 bit load zero-extended byte scaled unsigned 12 bit.  */
    861 static void
    862 ldrb32_abs (sim_cpu *cpu, uint32_t offset)
    863 {
    864   unsigned rn = INSTR (9, 5);
    865   unsigned rt = INSTR (4, 0);
    866 
    867   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    868   /* The target register may not be SP but the source may be
    869      there is no scaling required for a byte load.  */
    870   aarch64_set_reg_u64 (cpu, rt, NO_SP,
    871 		       aarch64_get_mem_u8
    872 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
    873 }
    874 
    875 /* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback.  */
    876 static void
    877 ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    878 {
    879   unsigned rn = INSTR (9, 5);
    880   unsigned rt = INSTR (4, 0);
    881   uint64_t address;
    882 
    883   if (rn == rt && wb != NoWriteBack)
    884     HALT_UNALLOC;
    885 
    886   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    887 
    888   if (wb != Post)
    889     address += offset;
    890 
    891   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    892   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
    893 
    894   if (wb == Post)
    895     address += offset;
    896 
    897   if (wb != NoWriteBack)
    898     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
    899 }
    900 
    901 /* 32 bit load zero-extended byte scaled or unscaled zero-
    902    or sign-extended 32-bit register offset.  */
    903 static void
    904 ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
    905 {
    906   unsigned rm = INSTR (20, 16);
    907   unsigned rn = INSTR (9, 5);
    908   unsigned rt = INSTR (4, 0);
    909   /* rn may reference SP, rm and rt must reference ZR  */
    910 
    911   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    912   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
    913 				 extension);
    914 
    915   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    916   /* There is no scaling required for a byte load.  */
    917   aarch64_set_reg_u64 (cpu, rt, NO_SP,
    918 		       aarch64_get_mem_u8 (cpu, address + displacement));
    919 }
    920 
    921 /* 64 bit load sign-extended byte unscaled signed 9 bit
    922    with pre- or post-writeback.  */
    923 static void
    924 ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    925 {
    926   unsigned rn = INSTR (9, 5);
    927   unsigned rt = INSTR (4, 0);
    928   uint64_t address;
    929   int64_t val;
    930 
    931   if (rn == rt && wb != NoWriteBack)
    932     HALT_UNALLOC;
    933 
    934   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    935 
    936   if (wb != Post)
    937     address += offset;
    938 
    939   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    940   val = aarch64_get_mem_s8 (cpu, address);
    941   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
    942 
    943   if (wb == Post)
    944     address += offset;
    945 
    946   if (wb != NoWriteBack)
    947     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
    948 }
    949 
    950 /* 64 bit load sign-extended byte scaled unsigned 12 bit.  */
    951 static void
    952 ldrsb_abs (sim_cpu *cpu, uint32_t offset)
    953 {
    954   ldrsb_wb (cpu, offset, NoWriteBack);
    955 }
    956 
    957 /* 64 bit load sign-extended byte scaled or unscaled zero-
    958    or sign-extended 32-bit register offset.  */
    959 static void
    960 ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
    961 {
    962   unsigned rm = INSTR (20, 16);
    963   unsigned rn = INSTR (9, 5);
    964   unsigned rt = INSTR (4, 0);
    965   /* rn may reference SP, rm and rt must reference ZR  */
    966 
    967   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
    968   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
    969 				 extension);
    970   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    971   /* There is no scaling required for a byte load.  */
    972   aarch64_set_reg_s64 (cpu, rt, NO_SP,
    973 		       aarch64_get_mem_s8 (cpu, address + displacement));
    974 }
    975 
    976 /* 32 bit load zero-extended short scaled unsigned 12 bit.  */
    977 static void
    978 ldrh32_abs (sim_cpu *cpu, uint32_t offset)
    979 {
    980   unsigned rn = INSTR (9, 5);
    981   unsigned rt = INSTR (4, 0);
    982   uint32_t val;
    983 
    984   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
    985   /* The target register may not be SP but the source may be.  */
    986   val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
    987 			     + SCALE (offset, 16));
    988   aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
    989 }
    990 
    991 /* 32 bit load zero-extended short unscaled signed 9 bit
    992    with pre- or post-writeback.  */
    993 static void
    994 ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
    995 {
    996   unsigned rn = INSTR (9, 5);
    997   unsigned rt = INSTR (4, 0);
    998   uint64_t address;
    999 
   1000   if (rn == rt && wb != NoWriteBack)
   1001     HALT_UNALLOC;
   1002 
   1003   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1004 
   1005   if (wb != Post)
   1006     address += offset;
   1007 
   1008   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1009   aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
   1010 
   1011   if (wb == Post)
   1012     address += offset;
   1013 
   1014   if (wb != NoWriteBack)
   1015     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1016 }
   1017 
   1018 /* 32 bit load zero-extended short scaled or unscaled zero-
   1019    or sign-extended 32-bit register offset.  */
   1020 static void
   1021 ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1022 {
   1023   unsigned rm = INSTR (20, 16);
   1024   unsigned rn = INSTR (9, 5);
   1025   unsigned rt = INSTR (4, 0);
   1026   /* rn may reference SP, rm and rt must reference ZR  */
   1027 
   1028   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1029   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
   1030   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
   1031 
   1032   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1033   aarch64_set_reg_u32 (cpu, rt, NO_SP,
   1034 		       aarch64_get_mem_u16 (cpu, address + displacement));
   1035 }
   1036 
   1037 /* 32 bit load sign-extended short scaled unsigned 12 bit.  */
   1038 static void
   1039 ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
   1040 {
   1041   unsigned rn = INSTR (9, 5);
   1042   unsigned rt = INSTR (4, 0);
   1043   int32_t val;
   1044 
   1045   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1046   /* The target register may not be SP but the source may be.  */
   1047   val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
   1048 			     + SCALE (offset, 16));
   1049   aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
   1050 }
   1051 
   1052 /* 32 bit load sign-extended short unscaled signed 9 bit
   1053    with pre- or post-writeback.  */
   1054 static void
   1055 ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   1056 {
   1057   unsigned rn = INSTR (9, 5);
   1058   unsigned rt = INSTR (4, 0);
   1059   uint64_t address;
   1060 
   1061   if (rn == rt && wb != NoWriteBack)
   1062     HALT_UNALLOC;
   1063 
   1064   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1065 
   1066   if (wb != Post)
   1067     address += offset;
   1068 
   1069   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1070   aarch64_set_reg_s32 (cpu, rt, NO_SP,
   1071 		       (int32_t) aarch64_get_mem_s16 (cpu, address));
   1072 
   1073   if (wb == Post)
   1074     address += offset;
   1075 
   1076   if (wb != NoWriteBack)
   1077     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1078 }
   1079 
   1080 /* 32 bit load sign-extended short scaled or unscaled zero-
   1081    or sign-extended 32-bit register offset.  */
   1082 static void
   1083 ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1084 {
   1085   unsigned rm = INSTR (20, 16);
   1086   unsigned rn = INSTR (9, 5);
   1087   unsigned rt = INSTR (4, 0);
   1088   /* rn may reference SP, rm and rt must reference ZR  */
   1089 
   1090   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1091   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
   1092   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
   1093 
   1094   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1095   aarch64_set_reg_s32 (cpu, rt, NO_SP,
   1096 		       (int32_t) aarch64_get_mem_s16
   1097 		       (cpu, address + displacement));
   1098 }
   1099 
   1100 /* 64 bit load sign-extended short scaled unsigned 12 bit.  */
   1101 static void
   1102 ldrsh_abs (sim_cpu *cpu, uint32_t offset)
   1103 {
   1104   unsigned rn = INSTR (9, 5);
   1105   unsigned rt = INSTR (4, 0);
   1106   int64_t val;
   1107 
   1108   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1109   /* The target register may not be SP but the source may be.  */
   1110   val = aarch64_get_mem_s16  (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
   1111 			      + SCALE (offset, 16));
   1112   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
   1113 }
   1114 
   1115 /* 64 bit load sign-extended short unscaled signed 9 bit
   1116    with pre- or post-writeback.  */
   1117 static void
   1118 ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   1119 {
   1120   unsigned rn = INSTR (9, 5);
   1121   unsigned rt = INSTR (4, 0);
   1122   uint64_t address;
   1123   int64_t val;
   1124 
   1125   if (rn == rt && wb != NoWriteBack)
   1126     HALT_UNALLOC;
   1127 
   1128   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1129   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1130 
   1131   if (wb != Post)
   1132     address += offset;
   1133 
   1134   val = aarch64_get_mem_s16 (cpu, address);
   1135   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
   1136 
   1137   if (wb == Post)
   1138     address += offset;
   1139 
   1140   if (wb != NoWriteBack)
   1141     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1142 }
   1143 
   1144 /* 64 bit load sign-extended short scaled or unscaled zero-
   1145    or sign-extended 32-bit register offset.  */
   1146 static void
   1147 ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1148 {
   1149   unsigned rm = INSTR (20, 16);
   1150   unsigned rn = INSTR (9, 5);
   1151   unsigned rt = INSTR (4, 0);
   1152 
   1153   /* rn may reference SP, rm and rt must reference ZR  */
   1154 
   1155   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1156   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
   1157   uint64_t displacement = OPT_SCALE (extended, 16, scaling);
   1158   int64_t val;
   1159 
   1160   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1161   val = aarch64_get_mem_s16 (cpu, address + displacement);
   1162   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
   1163 }
   1164 
   1165 /* 64 bit load sign-extended 32 bit scaled unsigned 12 bit.  */
   1166 static void
   1167 ldrsw_abs (sim_cpu *cpu, uint32_t offset)
   1168 {
   1169   unsigned rn = INSTR (9, 5);
   1170   unsigned rt = INSTR (4, 0);
   1171   int64_t val;
   1172 
   1173   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1174   val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
   1175 			     + SCALE (offset, 32));
   1176   /* The target register may not be SP but the source may be.  */
   1177   return aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
   1178 }
   1179 
   1180 /* 64 bit load sign-extended 32 bit unscaled signed 9 bit
   1181    with pre- or post-writeback.  */
   1182 static void
   1183 ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   1184 {
   1185   unsigned rn = INSTR (9, 5);
   1186   unsigned rt = INSTR (4, 0);
   1187   uint64_t address;
   1188 
   1189   if (rn == rt && wb != NoWriteBack)
   1190     HALT_UNALLOC;
   1191 
   1192   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1193 
   1194   if (wb != Post)
   1195     address += offset;
   1196 
   1197   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1198   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));
   1199 
   1200   if (wb == Post)
   1201     address += offset;
   1202 
   1203   if (wb != NoWriteBack)
   1204     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1205 }
   1206 
   1207 /* 64 bit load sign-extended 32 bit scaled or unscaled zero-
   1208    or sign-extended 32-bit register offset.  */
   1209 static void
   1210 ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1211 {
   1212   unsigned rm = INSTR (20, 16);
   1213   unsigned rn = INSTR (9, 5);
   1214   unsigned rt = INSTR (4, 0);
   1215   /* rn may reference SP, rm and rt must reference ZR  */
   1216 
   1217   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1218   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
   1219   uint64_t displacement =  OPT_SCALE (extended, 32, scaling);
   1220 
   1221   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1222   aarch64_set_reg_s64 (cpu, rt, NO_SP,
   1223 		       aarch64_get_mem_s32 (cpu, address + displacement));
   1224 }
   1225 
   1226 /* N.B. with stores the value in source is written to the
   1227    address identified by source2 modified by source3/offset.  */
   1228 
   1229 /* 32 bit store scaled unsigned 12 bit.  */
   1230 static void
   1231 str32_abs (sim_cpu *cpu, uint32_t offset)
   1232 {
   1233   unsigned rn = INSTR (9, 5);
   1234   unsigned rt = INSTR (4, 0);
   1235 
   1236   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1237   /* The target register may not be SP but the source may be.  */
   1238   aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
   1239 			     + SCALE (offset, 32)),
   1240 		       aarch64_get_reg_u32 (cpu, rt, NO_SP));
   1241 }
   1242 
   1243 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
   1244 static void
   1245 str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   1246 {
   1247   unsigned rn = INSTR (9, 5);
   1248   unsigned rt = INSTR (4, 0);
   1249   uint64_t address;
   1250 
   1251   if (rn == rt && wb != NoWriteBack)
   1252     HALT_UNALLOC;
   1253 
   1254   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1255   if (wb != Post)
   1256     address += offset;
   1257 
   1258   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1259   aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));
   1260 
   1261   if (wb == Post)
   1262     address += offset;
   1263 
   1264   if (wb != NoWriteBack)
   1265     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1266 }
   1267 
   1268 /* 32 bit store scaled or unscaled zero- or
   1269    sign-extended 32-bit register offset.  */
   1270 static void
   1271 str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1272 {
   1273   unsigned rm = INSTR (20, 16);
   1274   unsigned rn = INSTR (9, 5);
   1275   unsigned rt = INSTR (4, 0);
   1276 
   1277   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1278   int64_t  extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
   1279   uint64_t displacement = OPT_SCALE (extended, 32, scaling);
   1280 
   1281   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1282   aarch64_set_mem_u32 (cpu, address + displacement,
   1283 		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
   1284 }
   1285 
   1286 /* 64 bit store scaled unsigned 12 bit.  */
   1287 static void
   1288 str_abs (sim_cpu *cpu, uint32_t offset)
   1289 {
   1290   unsigned rn = INSTR (9, 5);
   1291   unsigned rt = INSTR (4, 0);
   1292 
   1293   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1294   aarch64_set_mem_u64 (cpu,
   1295 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
   1296 		       + SCALE (offset, 64),
   1297 		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
   1298 }
   1299 
   1300 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
   1301 static void
   1302 str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   1303 {
   1304   unsigned rn = INSTR (9, 5);
   1305   unsigned rt = INSTR (4, 0);
   1306   uint64_t address;
   1307 
   1308   if (rn == rt && wb != NoWriteBack)
   1309     HALT_UNALLOC;
   1310 
   1311   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1312 
   1313   if (wb != Post)
   1314     address += offset;
   1315 
   1316   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1317   aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));
   1318 
   1319   if (wb == Post)
   1320     address += offset;
   1321 
   1322   if (wb != NoWriteBack)
   1323     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1324 }
   1325 
   1326 /* 64 bit store scaled or unscaled zero-
   1327    or sign-extended 32-bit register offset.  */
   1328 static void
   1329 str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1330 {
   1331   unsigned rm = INSTR (20, 16);
   1332   unsigned rn = INSTR (9, 5);
   1333   unsigned rt = INSTR (4, 0);
   1334   /* rn may reference SP, rm and rt must reference ZR  */
   1335 
   1336   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1337   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   1338 			       extension);
   1339   uint64_t displacement = OPT_SCALE (extended, 64, scaling);
   1340 
   1341   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1342   aarch64_set_mem_u64 (cpu, address + displacement,
   1343 		       aarch64_get_reg_u64 (cpu, rt, NO_SP));
   1344 }
   1345 
   1346 /* 32 bit store byte scaled unsigned 12 bit.  */
   1347 static void
   1348 strb_abs (sim_cpu *cpu, uint32_t offset)
   1349 {
   1350   unsigned rn = INSTR (9, 5);
   1351   unsigned rt = INSTR (4, 0);
   1352 
   1353   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1354   /* The target register may not be SP but the source may be.
   1355      There is no scaling required for a byte load.  */
   1356   aarch64_set_mem_u8 (cpu,
   1357 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
   1358 		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
   1359 }
   1360 
   1361 /* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback.  */
   1362 static void
   1363 strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   1364 {
   1365   unsigned rn = INSTR (9, 5);
   1366   unsigned rt = INSTR (4, 0);
   1367   uint64_t address;
   1368 
   1369   if (rn == rt && wb != NoWriteBack)
   1370     HALT_UNALLOC;
   1371 
   1372   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1373 
   1374   if (wb != Post)
   1375     address += offset;
   1376 
   1377   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1378   aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));
   1379 
   1380   if (wb == Post)
   1381     address += offset;
   1382 
   1383   if (wb != NoWriteBack)
   1384     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1385 }
   1386 
   1387 /* 32 bit store byte scaled or unscaled zero-
   1388    or sign-extended 32-bit register offset.  */
   1389 static void
   1390 strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1391 {
   1392   unsigned rm = INSTR (20, 16);
   1393   unsigned rn = INSTR (9, 5);
   1394   unsigned rt = INSTR (4, 0);
   1395   /* rn may reference SP, rm and rt must reference ZR  */
   1396 
   1397   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1398   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   1399 				 extension);
   1400 
   1401   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1402   /* There is no scaling required for a byte load.  */
   1403   aarch64_set_mem_u8 (cpu, address + displacement,
   1404 		      aarch64_get_reg_u8 (cpu, rt, NO_SP));
   1405 }
   1406 
   1407 /* 32 bit store short scaled unsigned 12 bit.  */
   1408 static void
   1409 strh_abs (sim_cpu *cpu, uint32_t offset)
   1410 {
   1411   unsigned rn = INSTR (9, 5);
   1412   unsigned rt = INSTR (4, 0);
   1413 
   1414   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1415   /* The target register may not be SP but the source may be.  */
   1416   aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
   1417 		       + SCALE (offset, 16),
   1418 		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
   1419 }
   1420 
   1421 /* 32 bit store short unscaled signed 9 bit with pre- or post-writeback.  */
   1422 static void
   1423 strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   1424 {
   1425   unsigned rn = INSTR (9, 5);
   1426   unsigned rt = INSTR (4, 0);
   1427   uint64_t address;
   1428 
   1429   if (rn == rt && wb != NoWriteBack)
   1430     HALT_UNALLOC;
   1431 
   1432   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1433 
   1434   if (wb != Post)
   1435     address += offset;
   1436 
   1437   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1438   aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));
   1439 
   1440   if (wb == Post)
   1441     address += offset;
   1442 
   1443   if (wb != NoWriteBack)
   1444     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   1445 }
   1446 
   1447 /* 32 bit store short scaled or unscaled zero-
   1448    or sign-extended 32-bit register offset.  */
   1449 static void
   1450 strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1451 {
   1452   unsigned rm = INSTR (20, 16);
   1453   unsigned rn = INSTR (9, 5);
   1454   unsigned rt = INSTR (4, 0);
   1455   /* rn may reference SP, rm and rt must reference ZR  */
   1456 
   1457   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1458   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
   1459   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
   1460 
   1461   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1462   aarch64_set_mem_u16 (cpu, address + displacement,
   1463 		       aarch64_get_reg_u16 (cpu, rt, NO_SP));
   1464 }
   1465 
   1466 /* Prefetch unsigned 12 bit.  */
   1467 static void
   1468 prfm_abs (sim_cpu *cpu, uint32_t offset)
   1469 {
   1470   /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
   1471                           00010 ==> PLDL2KEEP, 00001 ==> PLDL2STRM,
   1472                           00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
   1473                           10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
   1474                           10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM,
   1475                           10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
   1476                           ow ==> UNALLOC
   1477      PrfOp prfop = prfop (instr, 4, 0);
   1478      uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
   1479      + SCALE (offset, 64).  */
   1480 
   1481   /* TODO : implement prefetch of address.  */
   1482 }
   1483 
   1484 /* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.  */
   1485 static void
   1486 prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   1487 {
   1488   /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
   1489                           00010 ==> PLDL2KEEP, 00001 ==> PLDL2STRM,
   1490                           00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
   1491                           10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
   1492                           10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM,
   1493                           10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
   1494                           ow ==> UNALLOC
   1495      rn may reference SP, rm may only reference ZR
   1496      PrfOp prfop = prfop (instr, 4, 0);
   1497      uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1498      int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   1499                                 extension);
   1500      uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
   1501      uint64_t address = base + displacement.  */
   1502 
   1503   /* TODO : implement prefetch of address  */
   1504 }
   1505 
   1506 /* 64 bit pc-relative prefetch.  */
   1507 static void
   1508 prfm_pcrel (sim_cpu *cpu, int32_t offset)
   1509 {
   1510   /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
   1511                           00010 ==> PLDL2KEEP, 00001 ==> PLDL2STRM,
   1512                           00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
   1513                           10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
   1514                           10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM,
   1515                           10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
   1516                           ow ==> UNALLOC
   1517      PrfOp prfop = prfop (instr, 4, 0);
   1518      uint64_t address = aarch64_get_PC (cpu) + offset.  */
   1519 
   1520   /* TODO : implement this  */
   1521 }
   1522 
   1523 /* Load-store exclusive.  */
   1524 
   1525 static void
   1526 ldxr (sim_cpu *cpu)
   1527 {
   1528   unsigned rn = INSTR (9, 5);
   1529   unsigned rt = INSTR (4, 0);
   1530   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1531   int size = INSTR (31, 30);
   1532   /* int ordered = INSTR (15, 15);  */
   1533   /* int exclusive = ! INSTR (23, 23);  */
   1534 
   1535   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1536   switch (size)
   1537     {
   1538     case 0:
   1539       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
   1540       break;
   1541     case 1:
   1542       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
   1543       break;
   1544     case 2:
   1545       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
   1546       break;
   1547     case 3:
   1548       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
   1549       break;
   1550     }
   1551 }
   1552 
   1553 static void
   1554 stxr (sim_cpu *cpu)
   1555 {
   1556   unsigned rn = INSTR (9, 5);
   1557   unsigned rt = INSTR (4, 0);
   1558   unsigned rs = INSTR (20, 16);
   1559   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1560   int      size = INSTR (31, 30);
   1561   uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);
   1562 
   1563   switch (size)
   1564     {
   1565     case 0: aarch64_set_mem_u8 (cpu, address, data); break;
   1566     case 1: aarch64_set_mem_u16 (cpu, address, data); break;
   1567     case 2: aarch64_set_mem_u32 (cpu, address, data); break;
   1568     case 3: aarch64_set_mem_u64 (cpu, address, data); break;
   1569     }
   1570 
   1571   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1572   aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* Always exclusive...  */
   1573 }
   1574 
   1575 static void
   1576 dexLoadLiteral (sim_cpu *cpu)
   1577 {
   1578   /* instr[29,27] == 011
   1579      instr[25,24] == 00
   1580      instr[31,30:26] = opc: 000 ==> LDRW,  001 ==> FLDRS
   1581                             010 ==> LDRX,  011 ==> FLDRD
   1582                             100 ==> LDRSW, 101 ==> FLDRQ
   1583                             110 ==> PRFM, 111 ==> UNALLOC
   1584      instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
   1585      instr[23, 5] == simm19  */
   1586 
   1587   /* unsigned rt = INSTR (4, 0);  */
   1588   uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
   1589   int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);
   1590 
   1591   switch (dispatch)
   1592     {
   1593     case 0: ldr32_pcrel (cpu, imm); break;
   1594     case 1: fldrs_pcrel (cpu, imm); break;
   1595     case 2: ldr_pcrel   (cpu, imm); break;
   1596     case 3: fldrd_pcrel (cpu, imm); break;
   1597     case 4: ldrsw_pcrel (cpu, imm); break;
   1598     case 5: fldrq_pcrel (cpu, imm); break;
   1599     case 6: prfm_pcrel  (cpu, imm); break;
   1600     case 7:
   1601     default:
   1602       HALT_UNALLOC;
   1603     }
   1604 }
   1605 
   1606 /* Immediate arithmetic
   1607    The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
   1608    value left shifted by 12 bits (done at decode).
   1609 
   1610    N.B. the register args (dest, source) can normally be Xn or SP.
   1611    the exception occurs for flag setting instructions which may
   1612    only use Xn for the output (dest).  */
   1613 
   1614 /* 32 bit add immediate.  */
   1615 static void
   1616 add32 (sim_cpu *cpu, uint32_t aimm)
   1617 {
   1618   unsigned rn = INSTR (9, 5);
   1619   unsigned rd = INSTR (4, 0);
   1620 
   1621   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1622   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   1623 		       aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
   1624 }
   1625 
   1626 /* 64 bit add immediate.  */
   1627 static void
   1628 add64 (sim_cpu *cpu, uint32_t aimm)
   1629 {
   1630   unsigned rn = INSTR (9, 5);
   1631   unsigned rd = INSTR (4, 0);
   1632 
   1633   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1634   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   1635 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
   1636 }
   1637 
   1638 static void
   1639 set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
   1640 {
   1641   int32_t   result = value1 + value2;
   1642   int64_t   sresult = (int64_t) value1 + (int64_t) value2;
   1643   uint64_t  uresult = (uint64_t)(uint32_t) value1
   1644     + (uint64_t)(uint32_t) value2;
   1645   uint32_t  flags = 0;
   1646 
   1647   if (result == 0)
   1648     flags |= Z;
   1649 
   1650   if (result & (1 << 31))
   1651     flags |= N;
   1652 
   1653   if (uresult != (uint32_t)result)
   1654     flags |= C;
   1655 
   1656   if (sresult != result)
   1657     flags |= V;
   1658 
   1659   aarch64_set_CPSR (cpu, flags);
   1660 }
   1661 
/* Sign tests used by the flag-setting helpers.  Both expand to an
   expression that refers to a variable named `signbit', which must be
   in scope at the point of use and hold the mask to test against.
   NEG (A) is true when every bit of that mask is set in A; POS (A) is
   true when none of them is.  */
#define NEG(a) (((a) & signbit) == signbit)
#define POS(a) (((a) & signbit) == 0)
   1664 
   1665 static void
   1666 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
   1667 {
   1668   uint64_t result = value1 + value2;
   1669   uint32_t flags = 0;
   1670   uint64_t signbit = 1ULL << 63;
   1671 
   1672   if (result == 0)
   1673     flags |= Z;
   1674 
   1675   if (NEG (result))
   1676     flags |= N;
   1677 
   1678   if (   (NEG (value1) && NEG (value2))
   1679       || (NEG (value1) && POS (result))
   1680       || (NEG (value2) && POS (result)))
   1681     flags |= C;
   1682 
   1683   if (   (NEG (value1) && NEG (value2) && POS (result))
   1684       || (POS (value1) && POS (value2) && NEG (result)))
   1685     flags |= V;
   1686 
   1687   aarch64_set_CPSR (cpu, flags);
   1688 }
   1689 
   1690 static void
   1691 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
   1692 {
   1693   uint32_t result = value1 - value2;
   1694   uint32_t flags = 0;
   1695   uint32_t signbit = 1U << 31;
   1696 
   1697   if (result == 0)
   1698     flags |= Z;
   1699 
   1700   if (NEG (result))
   1701     flags |= N;
   1702 
   1703   if (   (NEG (value1) && POS (value2))
   1704       || (NEG (value1) && POS (result))
   1705       || (POS (value2) && POS (result)))
   1706     flags |= C;
   1707 
   1708   if (   (NEG (value1) && POS (value2) && POS (result))
   1709       || (POS (value1) && NEG (value2) && NEG (result)))
   1710     flags |= V;
   1711 
   1712   aarch64_set_CPSR (cpu, flags);
   1713 }
   1714 
   1715 static void
   1716 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
   1717 {
   1718   uint64_t result = value1 - value2;
   1719   uint32_t flags = 0;
   1720   uint64_t signbit = 1ULL << 63;
   1721 
   1722   if (result == 0)
   1723     flags |= Z;
   1724 
   1725   if (NEG (result))
   1726     flags |= N;
   1727 
   1728   if (   (NEG (value1) && POS (value2))
   1729       || (NEG (value1) && POS (result))
   1730       || (POS (value2) && POS (result)))
   1731     flags |= C;
   1732 
   1733   if (   (NEG (value1) && POS (value2) && POS (result))
   1734       || (POS (value1) && NEG (value2) && NEG (result)))
   1735     flags |= V;
   1736 
   1737   aarch64_set_CPSR (cpu, flags);
   1738 }
   1739 
   1740 static void
   1741 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
   1742 {
   1743   uint32_t flags = 0;
   1744 
   1745   if (result == 0)
   1746     flags |= Z;
   1747   else
   1748     flags &= ~ Z;
   1749 
   1750   if (result & (1 << 31))
   1751     flags |= N;
   1752   else
   1753     flags &= ~ N;
   1754 
   1755   aarch64_set_CPSR (cpu, flags);
   1756 }
   1757 
   1758 static void
   1759 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
   1760 {
   1761   uint32_t flags = 0;
   1762 
   1763   if (result == 0)
   1764     flags |= Z;
   1765   else
   1766     flags &= ~ Z;
   1767 
   1768   if (result & (1ULL << 63))
   1769     flags |= N;
   1770   else
   1771     flags &= ~ N;
   1772 
   1773   aarch64_set_CPSR (cpu, flags);
   1774 }
   1775 
   1776 /* 32 bit add immediate set flags.  */
   1777 static void
   1778 adds32 (sim_cpu *cpu, uint32_t aimm)
   1779 {
   1780   unsigned rn = INSTR (9, 5);
   1781   unsigned rd = INSTR (4, 0);
   1782   /* TODO : do we need to worry about signs here?  */
   1783   int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
   1784 
   1785   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1786   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
   1787   set_flags_for_add32 (cpu, value1, aimm);
   1788 }
   1789 
   1790 /* 64 bit add immediate set flags.  */
   1791 static void
   1792 adds64 (sim_cpu *cpu, uint32_t aimm)
   1793 {
   1794   unsigned rn = INSTR (9, 5);
   1795   unsigned rd = INSTR (4, 0);
   1796   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1797   uint64_t value2 = aimm;
   1798 
   1799   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1800   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
   1801   set_flags_for_add64 (cpu, value1, value2);
   1802 }
   1803 
   1804 /* 32 bit sub immediate.  */
   1805 static void
   1806 sub32 (sim_cpu *cpu, uint32_t aimm)
   1807 {
   1808   unsigned rn = INSTR (9, 5);
   1809   unsigned rd = INSTR (4, 0);
   1810 
   1811   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1812   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   1813 		       aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
   1814 }
   1815 
   1816 /* 64 bit sub immediate.  */
   1817 static void
   1818 sub64 (sim_cpu *cpu, uint32_t aimm)
   1819 {
   1820   unsigned rn = INSTR (9, 5);
   1821   unsigned rd = INSTR (4, 0);
   1822 
   1823   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1824   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   1825 		       aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
   1826 }
   1827 
   1828 /* 32 bit sub immediate set flags.  */
   1829 static void
   1830 subs32 (sim_cpu *cpu, uint32_t aimm)
   1831 {
   1832   unsigned rn = INSTR (9, 5);
   1833   unsigned rd = INSTR (4, 0);
   1834   uint32_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1835   uint32_t value2 = aimm;
   1836 
   1837   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1838   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
   1839   set_flags_for_sub32 (cpu, value1, value2);
   1840 }
   1841 
   1842 /* 64 bit sub immediate set flags.  */
   1843 static void
   1844 subs64 (sim_cpu *cpu, uint32_t aimm)
   1845 {
   1846   unsigned rn = INSTR (9, 5);
   1847   unsigned rd = INSTR (4, 0);
   1848   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   1849   uint32_t value2 = aimm;
   1850 
   1851   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1852   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
   1853   set_flags_for_sub64 (cpu, value1, value2);
   1854 }
   1855 
   1856 /* Data Processing Register.  */
   1857 
   1858 /* First two helpers to perform the shift operations.  */
   1859 
   1860 static inline uint32_t
   1861 shifted32 (uint32_t value, Shift shift, uint32_t count)
   1862 {
   1863   switch (shift)
   1864     {
   1865     default:
   1866     case LSL:
   1867       return (value << count);
   1868     case LSR:
   1869       return (value >> count);
   1870     case ASR:
   1871       {
   1872 	int32_t svalue = value;
   1873 	return (svalue >> count);
   1874       }
   1875     case ROR:
   1876       {
   1877 	uint32_t top = value >> count;
   1878 	uint32_t bottom = value << (32 - count);
   1879 	return (bottom | top);
   1880       }
   1881     }
   1882 }
   1883 
   1884 static inline uint64_t
   1885 shifted64 (uint64_t value, Shift shift, uint32_t count)
   1886 {
   1887   switch (shift)
   1888     {
   1889     default:
   1890     case LSL:
   1891       return (value << count);
   1892     case LSR:
   1893       return (value >> count);
   1894     case ASR:
   1895       {
   1896 	int64_t svalue = value;
   1897 	return (svalue >> count);
   1898       }
   1899     case ROR:
   1900       {
   1901 	uint64_t top = value >> count;
   1902 	uint64_t bottom = value << (64 - count);
   1903 	return (bottom | top);
   1904       }
   1905     }
   1906 }
   1907 
   1908 /* Arithmetic shifted register.
   1909    These allow an optional LSL, ASR or LSR to the second source
   1910    register with a count up to the register bit count.
   1911 
   1912    N.B register args may not be SP.  */
   1913 
   1914 /* 32 bit ADD shifted register.  */
   1915 static void
   1916 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   1917 {
   1918   unsigned rm = INSTR (20, 16);
   1919   unsigned rn = INSTR (9, 5);
   1920   unsigned rd = INSTR (4, 0);
   1921 
   1922   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1923   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   1924 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
   1925 		       + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   1926 				    shift, count));
   1927 }
   1928 
   1929 /* 64 bit ADD shifted register.  */
   1930 static void
   1931 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   1932 {
   1933   unsigned rm = INSTR (20, 16);
   1934   unsigned rn = INSTR (9, 5);
   1935   unsigned rd = INSTR (4, 0);
   1936 
   1937   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1938   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   1939 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
   1940 		       + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
   1941 				    shift, count));
   1942 }
   1943 
   1944 /* 32 bit ADD shifted register setting flags.  */
   1945 static void
   1946 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   1947 {
   1948   unsigned rm = INSTR (20, 16);
   1949   unsigned rn = INSTR (9, 5);
   1950   unsigned rd = INSTR (4, 0);
   1951 
   1952   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   1953   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   1954 			       shift, count);
   1955 
   1956   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1957   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
   1958   set_flags_for_add32 (cpu, value1, value2);
   1959 }
   1960 
   1961 /* 64 bit ADD shifted register setting flags.  */
   1962 static void
   1963 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   1964 {
   1965   unsigned rm = INSTR (20, 16);
   1966   unsigned rn = INSTR (9, 5);
   1967   unsigned rd = INSTR (4, 0);
   1968 
   1969   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   1970   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
   1971 			       shift, count);
   1972 
   1973   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1974   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
   1975   set_flags_for_add64 (cpu, value1, value2);
   1976 }
   1977 
   1978 /* 32 bit SUB shifted register.  */
   1979 static void
   1980 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   1981 {
   1982   unsigned rm = INSTR (20, 16);
   1983   unsigned rn = INSTR (9, 5);
   1984   unsigned rd = INSTR (4, 0);
   1985 
   1986   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   1987   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   1988 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
   1989 		       - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   1990 				    shift, count));
   1991 }
   1992 
   1993 /* 64 bit SUB shifted register.  */
   1994 static void
   1995 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   1996 {
   1997   unsigned rm = INSTR (20, 16);
   1998   unsigned rn = INSTR (9, 5);
   1999   unsigned rd = INSTR (4, 0);
   2000 
   2001   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2002   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   2003 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
   2004 		       - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
   2005 				    shift, count));
   2006 }
   2007 
   2008 /* 32 bit SUB shifted register setting flags.  */
   2009 static void
   2010 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   2011 {
   2012   unsigned rm = INSTR (20, 16);
   2013   unsigned rn = INSTR (9, 5);
   2014   unsigned rd = INSTR (4, 0);
   2015 
   2016   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   2017   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   2018 			      shift, count);
   2019 
   2020   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2021   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
   2022   set_flags_for_sub32 (cpu, value1, value2);
   2023 }
   2024 
   2025 /* 64 bit SUB shifted register setting flags.  */
   2026 static void
   2027 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   2028 {
   2029   unsigned rm = INSTR (20, 16);
   2030   unsigned rn = INSTR (9, 5);
   2031   unsigned rd = INSTR (4, 0);
   2032 
   2033   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   2034   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
   2035 			       shift, count);
   2036 
   2037   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2038   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
   2039   set_flags_for_sub64 (cpu, value1, value2);
   2040 }
   2041 
   2042 /* First a couple more helpers to fetch the
   2043    relevant source register element either
   2044    sign or zero extended as required by the
   2045    extension value.  */
   2046 
   2047 static uint32_t
   2048 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
   2049 {
   2050   switch (extension)
   2051     {
   2052     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
   2053     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
   2054     case UXTW: /* Fall through.  */
   2055     case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
   2056     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
   2057     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
   2058     case SXTW: /* Fall through.  */
   2059     case SXTX: /* Fall through.  */
   2060     default:   return aarch64_get_reg_s32 (cpu, lo, NO_SP);
   2061   }
   2062 }
   2063 
   2064 static uint64_t
   2065 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
   2066 {
   2067   switch (extension)
   2068     {
   2069     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
   2070     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
   2071     case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
   2072     case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
   2073     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
   2074     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
   2075     case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
   2076     case SXTX:
   2077     default:   return aarch64_get_reg_s64 (cpu, lo, NO_SP);
   2078     }
   2079 }
   2080 
/* Arithmetic extending register.
   These allow an optional zero or sign extension of some portion of
   the second source register, followed by an optional left shift of
   0 to 4 bits.

   N.B. the output (dest) and first input arg (source) may normally be
   Xn or SP.  However, for flag setting operations dest can only be
   Xn.  Second input registers are always Xn.  */
   2089 
   2090 /* 32 bit ADD extending register.  */
   2091 static void
   2092 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2093 {
   2094   unsigned rm = INSTR (20, 16);
   2095   unsigned rn = INSTR (9, 5);
   2096   unsigned rd = INSTR (4, 0);
   2097 
   2098   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2099   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   2100 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
   2101 		       + (extreg32 (cpu, rm, extension) << shift));
   2102 }
   2103 
   2104 /* 64 bit ADD extending register.
   2105    N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
   2106 static void
   2107 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2108 {
   2109   unsigned rm = INSTR (20, 16);
   2110   unsigned rn = INSTR (9, 5);
   2111   unsigned rd = INSTR (4, 0);
   2112 
   2113   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2114   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   2115 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
   2116 		       + (extreg64 (cpu, rm, extension) << shift));
   2117 }
   2118 
   2119 /* 32 bit ADD extending register setting flags.  */
   2120 static void
   2121 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2122 {
   2123   unsigned rm = INSTR (20, 16);
   2124   unsigned rn = INSTR (9, 5);
   2125   unsigned rd = INSTR (4, 0);
   2126 
   2127   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
   2128   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
   2129 
   2130   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2131   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
   2132   set_flags_for_add32 (cpu, value1, value2);
   2133 }
   2134 
   2135 /* 64 bit ADD extending register setting flags  */
   2136 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
   2137 static void
   2138 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2139 {
   2140   unsigned rm = INSTR (20, 16);
   2141   unsigned rn = INSTR (9, 5);
   2142   unsigned rd = INSTR (4, 0);
   2143 
   2144   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   2145   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
   2146 
   2147   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2148   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
   2149   set_flags_for_add64 (cpu, value1, value2);
   2150 }
   2151 
   2152 /* 32 bit SUB extending register.  */
   2153 static void
   2154 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2155 {
   2156   unsigned rm = INSTR (20, 16);
   2157   unsigned rn = INSTR (9, 5);
   2158   unsigned rd = INSTR (4, 0);
   2159 
   2160   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2161   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   2162 		       aarch64_get_reg_u32 (cpu, rn, SP_OK)
   2163 		       - (extreg32 (cpu, rm, extension) << shift));
   2164 }
   2165 
   2166 /* 64 bit SUB extending register.  */
   2167 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
   2168 static void
   2169 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2170 {
   2171   unsigned rm = INSTR (20, 16);
   2172   unsigned rn = INSTR (9, 5);
   2173   unsigned rd = INSTR (4, 0);
   2174 
   2175   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2176   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   2177 		       aarch64_get_reg_u64 (cpu, rn, SP_OK)
   2178 		       - (extreg64 (cpu, rm, extension) << shift));
   2179 }
   2180 
   2181 /* 32 bit SUB extending register setting flags.  */
   2182 static void
   2183 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2184 {
   2185   unsigned rm = INSTR (20, 16);
   2186   unsigned rn = INSTR (9, 5);
   2187   unsigned rd = INSTR (4, 0);
   2188 
   2189   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
   2190   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
   2191 
   2192   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2193   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
   2194   set_flags_for_sub32 (cpu, value1, value2);
   2195 }
   2196 
   2197 /* 64 bit SUB extending register setting flags  */
   2198 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
   2199 static void
   2200 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
   2201 {
   2202   unsigned rm = INSTR (20, 16);
   2203   unsigned rn = INSTR (9, 5);
   2204   unsigned rd = INSTR (4, 0);
   2205 
   2206   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   2207   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
   2208 
   2209   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2210   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
   2211   set_flags_for_sub64 (cpu, value1, value2);
   2212 }
   2213 
   2214 static void
   2215 dexAddSubtractImmediate (sim_cpu *cpu)
   2216 {
   2217   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   2218      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
   2219      instr[29]    = set : 0 ==> no flags, 1 ==> set flags
   2220      instr[28,24] = 10001
   2221      instr[23,22] = shift : 00 == LSL#0, 01 = LSL#12 1x = UNALLOC
   2222      instr[21,10] = uimm12
   2223      instr[9,5]   = Rn
   2224      instr[4,0]   = Rd  */
   2225 
   2226   /* N.B. the shift is applied at decode before calling the add/sub routine.  */
   2227   uint32_t shift = INSTR (23, 22);
   2228   uint32_t imm = INSTR (21, 10);
   2229   uint32_t dispatch = INSTR (31, 29);
   2230 
   2231   NYI_assert (28, 24, 0x11);
   2232 
   2233   if (shift > 1)
   2234     HALT_UNALLOC;
   2235 
   2236   if (shift)
   2237     imm <<= 12;
   2238 
   2239   switch (dispatch)
   2240     {
   2241     case 0: add32 (cpu, imm); break;
   2242     case 1: adds32 (cpu, imm); break;
   2243     case 2: sub32 (cpu, imm); break;
   2244     case 3: subs32 (cpu, imm); break;
   2245     case 4: add64 (cpu, imm); break;
   2246     case 5: adds64 (cpu, imm); break;
   2247     case 6: sub64 (cpu, imm); break;
   2248     case 7: subs64 (cpu, imm); break;
   2249     }
   2250 }
   2251 
   2252 static void
   2253 dexAddSubtractShiftedRegister (sim_cpu *cpu)
   2254 {
   2255   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   2256      instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
   2257      instr[28,24] = 01011
   2258      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
   2259      instr[21]    = 0
   2260      instr[20,16] = Rm
   2261      instr[15,10] = count : must be 0xxxxx for 32 bit
   2262      instr[9,5]   = Rn
   2263      instr[4,0]   = Rd  */
   2264 
   2265   uint32_t size = INSTR (31, 31);
   2266   uint32_t count = INSTR (15, 10);
   2267   Shift shiftType = INSTR (23, 22);
   2268 
   2269   NYI_assert (28, 24, 0x0B);
   2270   NYI_assert (21, 21, 0);
   2271 
   2272   /* Shift encoded as ROR is unallocated.  */
   2273   if (shiftType == ROR)
   2274     HALT_UNALLOC;
   2275 
   2276   /* 32 bit operations must have count[5] = 0
   2277      or else we have an UNALLOC.  */
   2278   if (size == 0 && uimm (count, 5, 5))
   2279     HALT_UNALLOC;
   2280 
   2281   /* Dispatch on size:op i.e instr [31,29].  */
   2282   switch (INSTR (31, 29))
   2283     {
   2284     case 0: add32_shift  (cpu, shiftType, count); break;
   2285     case 1: adds32_shift (cpu, shiftType, count); break;
   2286     case 2: sub32_shift  (cpu, shiftType, count); break;
   2287     case 3: subs32_shift (cpu, shiftType, count); break;
   2288     case 4: add64_shift  (cpu, shiftType, count); break;
   2289     case 5: adds64_shift (cpu, shiftType, count); break;
   2290     case 6: sub64_shift  (cpu, shiftType, count); break;
   2291     case 7: subs64_shift (cpu, shiftType, count); break;
   2292     }
   2293 }
   2294 
   2295 static void
   2296 dexAddSubtractExtendedRegister (sim_cpu *cpu)
   2297 {
   2298   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   2299      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
   2300      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
   2301      instr[28,24] = 01011
   2302      instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
   2303      instr[21]    = 1
   2304      instr[20,16] = Rm
   2305      instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
   2306                              000 ==> LSL|UXTW, 001 ==> UXTZ,
   2307                              000 ==> SXTB, 001 ==> SXTH,
   2308                              000 ==> SXTW, 001 ==> SXTX,
   2309      instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
   2310      instr[9,5]   = Rn
   2311      instr[4,0]   = Rd  */
   2312 
   2313   Extension extensionType = INSTR (15, 13);
   2314   uint32_t shift = INSTR (12, 10);
   2315 
   2316   NYI_assert (28, 24, 0x0B);
   2317   NYI_assert (21, 21, 1);
   2318 
   2319   /* Shift may not exceed 4.  */
   2320   if (shift > 4)
   2321     HALT_UNALLOC;
   2322 
   2323   /* Dispatch on size:op:set?.  */
   2324   switch (INSTR (31, 29))
   2325     {
   2326     case 0: add32_ext  (cpu, extensionType, shift); break;
   2327     case 1: adds32_ext (cpu, extensionType, shift); break;
   2328     case 2: sub32_ext  (cpu, extensionType, shift); break;
   2329     case 3: subs32_ext (cpu, extensionType, shift); break;
   2330     case 4: add64_ext  (cpu, extensionType, shift); break;
   2331     case 5: adds64_ext (cpu, extensionType, shift); break;
   2332     case 6: sub64_ext  (cpu, extensionType, shift); break;
   2333     case 7: subs64_ext (cpu, extensionType, shift); break;
   2334     }
   2335 }
   2336 
   2337 /* Conditional data processing
   2338    Condition register is implicit 3rd source.  */
   2339 
   2340 /* 32 bit add with carry.  */
   2341 /* N.B register args may not be SP.  */
   2342 
   2343 static void
   2344 adc32 (sim_cpu *cpu)
   2345 {
   2346   unsigned rm = INSTR (20, 16);
   2347   unsigned rn = INSTR (9, 5);
   2348   unsigned rd = INSTR (4, 0);
   2349 
   2350   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2351   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   2352 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
   2353 		       + aarch64_get_reg_u32 (cpu, rm, NO_SP)
   2354 		       + IS_SET (C));
   2355 }
   2356 
   2357 /* 64 bit add with carry  */
   2358 static void
   2359 adc64 (sim_cpu *cpu)
   2360 {
   2361   unsigned rm = INSTR (20, 16);
   2362   unsigned rn = INSTR (9, 5);
   2363   unsigned rd = INSTR (4, 0);
   2364 
   2365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2366   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   2367 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
   2368 		       + aarch64_get_reg_u64 (cpu, rm, NO_SP)
   2369 		       + IS_SET (C));
   2370 }
   2371 
   2372 /* 32 bit add with carry setting flags.  */
   2373 static void
   2374 adcs32 (sim_cpu *cpu)
   2375 {
   2376   unsigned rm = INSTR (20, 16);
   2377   unsigned rn = INSTR (9, 5);
   2378   unsigned rd = INSTR (4, 0);
   2379 
   2380   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   2381   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
   2382   uint32_t carry = IS_SET (C);
   2383 
   2384   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2385   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
   2386   set_flags_for_add32 (cpu, value1, value2 + carry);
   2387 }
   2388 
   2389 /* 64 bit add with carry setting flags.  */
   2390 static void
   2391 adcs64 (sim_cpu *cpu)
   2392 {
   2393   unsigned rm = INSTR (20, 16);
   2394   unsigned rn = INSTR (9, 5);
   2395   unsigned rd = INSTR (4, 0);
   2396 
   2397   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   2398   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
   2399   uint64_t carry = IS_SET (C);
   2400 
   2401   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2402   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
   2403   set_flags_for_add64 (cpu, value1, value2 + carry);
   2404 }
   2405 
   2406 /* 32 bit sub with carry.  */
   2407 static void
   2408 sbc32 (sim_cpu *cpu)
   2409 {
   2410   unsigned rm = INSTR (20, 16);
   2411   unsigned rn = INSTR (9, 5); /* ngc iff rn == 31.  */
   2412   unsigned rd = INSTR (4, 0);
   2413 
   2414   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2415   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   2416 		       aarch64_get_reg_u32 (cpu, rn, NO_SP)
   2417 		       - aarch64_get_reg_u32 (cpu, rm, NO_SP)
   2418 		       - 1 + IS_SET (C));
   2419 }
   2420 
   2421 /* 64 bit sub with carry  */
   2422 static void
   2423 sbc64 (sim_cpu *cpu)
   2424 {
   2425   unsigned rm = INSTR (20, 16);
   2426   unsigned rn = INSTR (9, 5);
   2427   unsigned rd = INSTR (4, 0);
   2428 
   2429   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2430   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   2431 		       aarch64_get_reg_u64 (cpu, rn, NO_SP)
   2432 		       - aarch64_get_reg_u64 (cpu, rm, NO_SP)
   2433 		       - 1 + IS_SET (C));
   2434 }
   2435 
   2436 /* 32 bit sub with carry setting flags  */
   2437 static void
   2438 sbcs32 (sim_cpu *cpu)
   2439 {
   2440   unsigned rm = INSTR (20, 16);
   2441   unsigned rn = INSTR (9, 5);
   2442   unsigned rd = INSTR (4, 0);
   2443 
   2444   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   2445   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
   2446   uint32_t carry  = IS_SET (C);
   2447   uint32_t result = value1 - value2 + 1 - carry;
   2448 
   2449   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2450   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   2451   set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
   2452 }
   2453 
   2454 /* 64 bit sub with carry setting flags  */
   2455 static void
   2456 sbcs64 (sim_cpu *cpu)
   2457 {
   2458   unsigned rm = INSTR (20, 16);
   2459   unsigned rn = INSTR (9, 5);
   2460   unsigned rd = INSTR (4, 0);
   2461 
   2462   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   2463   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
   2464   uint64_t carry  = IS_SET (C);
   2465   uint64_t result = value1 - value2 + 1 - carry;
   2466 
   2467   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2468   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   2469   set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
   2470 }
   2471 
   2472 static void
   2473 dexAddSubtractWithCarry (sim_cpu *cpu)
   2474 {
   2475   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   2476      instr[30]    = op : 0 ==> ADC, 1 ==> SBC
   2477      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
   2478      instr[28,21] = 1 1010 000
   2479      instr[20,16] = Rm
   2480      instr[15,10] = op2 : 00000 ==> ok, ow ==> UNALLOC
   2481      instr[9,5]   = Rn
   2482      instr[4,0]   = Rd  */
   2483 
   2484   uint32_t op2 = INSTR (15, 10);
   2485 
   2486   NYI_assert (28, 21, 0xD0);
   2487 
   2488   if (op2 != 0)
   2489     HALT_UNALLOC;
   2490 
   2491   /* Dispatch on size:op:set?.  */
   2492   switch (INSTR (31, 29))
   2493     {
   2494     case 0: adc32 (cpu); break;
   2495     case 1: adcs32 (cpu); break;
   2496     case 2: sbc32 (cpu); break;
   2497     case 3: sbcs32 (cpu); break;
   2498     case 4: adc64 (cpu); break;
   2499     case 5: adcs64 (cpu); break;
   2500     case 6: sbc64 (cpu); break;
   2501     case 7: sbcs64 (cpu); break;
   2502     }
   2503 }
   2504 
   2505 static uint32_t
   2506 testConditionCode (sim_cpu *cpu, CondCode cc)
   2507 {
   2508   /* This should be reduceable to branchless logic
   2509      by some careful testing of bits in CC followed
   2510      by the requisite masking and combining of bits
   2511      from the flag register.
   2512 
   2513      For now we do it with a switch.  */
   2514   int res;
   2515 
   2516   switch (cc)
   2517     {
   2518     case EQ:  res = IS_SET (Z);    break;
   2519     case NE:  res = IS_CLEAR (Z);  break;
   2520     case CS:  res = IS_SET (C);    break;
   2521     case CC:  res = IS_CLEAR (C);  break;
   2522     case MI:  res = IS_SET (N);    break;
   2523     case PL:  res = IS_CLEAR (N);  break;
   2524     case VS:  res = IS_SET (V);    break;
   2525     case VC:  res = IS_CLEAR (V);  break;
   2526     case HI:  res = IS_SET (C) && IS_CLEAR (Z);  break;
   2527     case LS:  res = IS_CLEAR (C) || IS_SET (Z);  break;
   2528     case GE:  res = IS_SET (N) == IS_SET (V);    break;
   2529     case LT:  res = IS_SET (N) != IS_SET (V);    break;
   2530     case GT:  res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V));  break;
   2531     case LE:  res = IS_SET (Z) || (IS_SET (N) != IS_SET (V));    break;
   2532     case AL:
   2533     case NV:
   2534     default:
   2535       res = 1;
   2536       break;
   2537     }
   2538   return res;
   2539 }
   2540 
   2541 static void
   2542 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn  */
   2543 {
   2544   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   2545      instr[30]    = compare with positive (1) or negative value (0)
   2546      instr[29,21] = 1 1101 0010
   2547      instr[20,16] = Rm or const
   2548      instr[15,12] = cond
   2549      instr[11]    = compare reg (0) or const (1)
   2550      instr[10]    = 0
   2551      instr[9,5]   = Rn
   2552      instr[4]     = 0
   2553      instr[3,0]   = value for CPSR bits if the comparison does not take place.  */
   2554   signed int negate;
   2555   unsigned rm;
   2556   unsigned rn;
   2557 
   2558   NYI_assert (29, 21, 0x1d2);
   2559   NYI_assert (10, 10, 0);
   2560   NYI_assert (4, 4, 0);
   2561 
   2562   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2563   if (! testConditionCode (cpu, INSTR (15, 12)))
   2564     {
   2565       aarch64_set_CPSR (cpu, INSTR (3, 0));
   2566       return;
   2567     }
   2568 
   2569   negate = INSTR (30, 30) ? 1 : -1;
   2570   rm = INSTR (20, 16);
   2571   rn = INSTR ( 9,  5);
   2572 
   2573   if (INSTR (31, 31))
   2574     {
   2575       if (INSTR (11, 11))
   2576 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
   2577 			     negate * (uint64_t) rm);
   2578       else
   2579 	set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
   2580 			     negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
   2581     }
   2582   else
   2583     {
   2584       if (INSTR (11, 11))
   2585 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
   2586 			     negate * rm);
   2587       else
   2588 	set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
   2589 			     negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
   2590     }
   2591 }
   2592 
   2593 static void
   2594 do_vec_MOV_whole_vector (sim_cpu *cpu)
   2595 {
   2596   /* MOV Vd.T, Vs.T  (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
   2597 
   2598      instr[31]    = 0
   2599      instr[30]    = half(0)/full(1)
   2600      instr[29,21] = 001110101
   2601      instr[20,16] = Vs
   2602      instr[15,10] = 000111
   2603      instr[9,5]   = Vs
   2604      instr[4,0]   = Vd  */
   2605 
   2606   unsigned vs = INSTR (9, 5);
   2607   unsigned vd = INSTR (4, 0);
   2608 
   2609   NYI_assert (29, 21, 0x075);
   2610   NYI_assert (15, 10, 0x07);
   2611 
   2612   if (INSTR (20, 16) != vs)
   2613     HALT_NYI;
   2614 
   2615   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2616   if (INSTR (30, 30))
   2617     aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
   2618 
   2619   aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
   2620 }
   2621 
   2622 static void
   2623 do_vec_SMOV_into_scalar (sim_cpu *cpu)
   2624 {
   2625   /* instr[31]    = 0
   2626      instr[30]    = word(0)/long(1)
   2627      instr[29,21] = 00 1110 000
   2628      instr[20,16] = element size and index
   2629      instr[15,10] = 00 0010 11
   2630      instr[9,5]   = V source
   2631      instr[4,0]   = R dest  */
   2632 
   2633   unsigned vs = INSTR (9, 5);
   2634   unsigned rd = INSTR (4, 0);
   2635   unsigned imm5 = INSTR (20, 16);
   2636   unsigned full = INSTR (30, 30);
   2637   int size, index;
   2638 
   2639   NYI_assert (29, 21, 0x070);
   2640   NYI_assert (15, 10, 0x0B);
   2641 
   2642   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2643 
   2644   if (imm5 & 0x1)
   2645     {
   2646       size = 0;
   2647       index = (imm5 >> 1) & 0xF;
   2648     }
   2649   else if (imm5 & 0x2)
   2650     {
   2651       size = 1;
   2652       index = (imm5 >> 2) & 0x7;
   2653     }
   2654   else if (full && (imm5 & 0x4))
   2655     {
   2656       size = 2;
   2657       index = (imm5 >> 3) & 0x3;
   2658     }
   2659   else
   2660     HALT_UNALLOC;
   2661 
   2662   switch (size)
   2663     {
   2664     case 0:
   2665       if (full)
   2666 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
   2667 			     aarch64_get_vec_s8 (cpu, vs, index));
   2668       else
   2669 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
   2670 			     aarch64_get_vec_s8 (cpu, vs, index));
   2671       break;
   2672 
   2673     case 1:
   2674       if (full)
   2675 	aarch64_set_reg_s64 (cpu, rd, NO_SP,
   2676 			     aarch64_get_vec_s16 (cpu, vs, index));
   2677       else
   2678 	aarch64_set_reg_s32 (cpu, rd, NO_SP,
   2679 			     aarch64_get_vec_s16 (cpu, vs, index));
   2680       break;
   2681 
   2682     case 2:
   2683       aarch64_set_reg_s64 (cpu, rd, NO_SP,
   2684 			   aarch64_get_vec_s32 (cpu, vs, index));
   2685       break;
   2686 
   2687     default:
   2688       HALT_UNALLOC;
   2689     }
   2690 }
   2691 
   2692 static void
   2693 do_vec_UMOV_into_scalar (sim_cpu *cpu)
   2694 {
   2695   /* instr[31]    = 0
   2696      instr[30]    = word(0)/long(1)
   2697      instr[29,21] = 00 1110 000
   2698      instr[20,16] = element size and index
   2699      instr[15,10] = 00 0011 11
   2700      instr[9,5]   = V source
   2701      instr[4,0]   = R dest  */
   2702 
   2703   unsigned vs = INSTR (9, 5);
   2704   unsigned rd = INSTR (4, 0);
   2705   unsigned imm5 = INSTR (20, 16);
   2706   unsigned full = INSTR (30, 30);
   2707   int size, index;
   2708 
   2709   NYI_assert (29, 21, 0x070);
   2710   NYI_assert (15, 10, 0x0F);
   2711 
   2712   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2713 
   2714   if (!full)
   2715     {
   2716       if (imm5 & 0x1)
   2717 	{
   2718 	  size = 0;
   2719 	  index = (imm5 >> 1) & 0xF;
   2720 	}
   2721       else if (imm5 & 0x2)
   2722 	{
   2723 	  size = 1;
   2724 	  index = (imm5 >> 2) & 0x7;
   2725 	}
   2726       else if (imm5 & 0x4)
   2727 	{
   2728 	  size = 2;
   2729 	  index = (imm5 >> 3) & 0x3;
   2730 	}
   2731       else
   2732 	HALT_UNALLOC;
   2733     }
   2734   else if (imm5 & 0x8)
   2735     {
   2736       size = 3;
   2737       index = (imm5 >> 4) & 0x1;
   2738     }
   2739   else
   2740     HALT_UNALLOC;
   2741 
   2742   switch (size)
   2743     {
   2744     case 0:
   2745       aarch64_set_reg_u32 (cpu, rd, NO_SP,
   2746 			   aarch64_get_vec_u8 (cpu, vs, index));
   2747       break;
   2748 
   2749     case 1:
   2750       aarch64_set_reg_u32 (cpu, rd, NO_SP,
   2751 			   aarch64_get_vec_u16 (cpu, vs, index));
   2752       break;
   2753 
   2754     case 2:
   2755       aarch64_set_reg_u32 (cpu, rd, NO_SP,
   2756 			   aarch64_get_vec_u32 (cpu, vs, index));
   2757       break;
   2758 
   2759     case 3:
   2760       aarch64_set_reg_u64 (cpu, rd, NO_SP,
   2761 			   aarch64_get_vec_u64 (cpu, vs, index));
   2762       break;
   2763 
   2764     default:
   2765       HALT_UNALLOC;
   2766     }
   2767 }
   2768 
   2769 static void
   2770 do_vec_INS (sim_cpu *cpu)
   2771 {
   2772   /* instr[31,21] = 01001110000
   2773      instr[20,16] = element size and index
   2774      instr[15,10] = 000111
   2775      instr[9,5]   = W source
   2776      instr[4,0]   = V dest  */
   2777 
   2778   int index;
   2779   unsigned rs = INSTR (9, 5);
   2780   unsigned vd = INSTR (4, 0);
   2781 
   2782   NYI_assert (31, 21, 0x270);
   2783   NYI_assert (15, 10, 0x07);
   2784 
   2785   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2786   if (INSTR (16, 16))
   2787     {
   2788       index = INSTR (20, 17);
   2789       aarch64_set_vec_u8 (cpu, vd, index,
   2790 			  aarch64_get_reg_u8 (cpu, rs, NO_SP));
   2791     }
   2792   else if (INSTR (17, 17))
   2793     {
   2794       index = INSTR (20, 18);
   2795       aarch64_set_vec_u16 (cpu, vd, index,
   2796 			   aarch64_get_reg_u16 (cpu, rs, NO_SP));
   2797     }
   2798   else if (INSTR (18, 18))
   2799     {
   2800       index = INSTR (20, 19);
   2801       aarch64_set_vec_u32 (cpu, vd, index,
   2802 			   aarch64_get_reg_u32 (cpu, rs, NO_SP));
   2803     }
   2804   else if (INSTR (19, 19))
   2805     {
   2806       index = INSTR (20, 20);
   2807       aarch64_set_vec_u64 (cpu, vd, index,
   2808 			   aarch64_get_reg_u64 (cpu, rs, NO_SP));
   2809     }
   2810   else
   2811     HALT_NYI;
   2812 }
   2813 
   2814 static void
   2815 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
   2816 {
   2817   /* instr[31]    = 0
   2818      instr[30]    = half(0)/full(1)
   2819      instr[29,21] = 00 1110 000
   2820      instr[20,16] = element size and index
   2821      instr[15,10] = 0000 01
   2822      instr[9,5]   = V source
   2823      instr[4,0]   = V dest.  */
   2824 
   2825   unsigned full = INSTR (30, 30);
   2826   unsigned vs = INSTR (9, 5);
   2827   unsigned vd = INSTR (4, 0);
   2828   int i, index;
   2829 
   2830   NYI_assert (29, 21, 0x070);
   2831   NYI_assert (15, 10, 0x01);
   2832 
   2833   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2834   if (INSTR (16, 16))
   2835     {
   2836       index = INSTR (20, 17);
   2837 
   2838       for (i = 0; i < (full ? 16 : 8); i++)
   2839 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
   2840     }
   2841   else if (INSTR (17, 17))
   2842     {
   2843       index = INSTR (20, 18);
   2844 
   2845       for (i = 0; i < (full ? 8 : 4); i++)
   2846 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
   2847     }
   2848   else if (INSTR (18, 18))
   2849     {
   2850       index = INSTR (20, 19);
   2851 
   2852       for (i = 0; i < (full ? 4 : 2); i++)
   2853 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
   2854     }
   2855   else
   2856     {
   2857       if (INSTR (19, 19) == 0)
   2858 	HALT_UNALLOC;
   2859 
   2860       if (! full)
   2861 	HALT_UNALLOC;
   2862 
   2863       index = INSTR (20, 20);
   2864 
   2865       for (i = 0; i < 2; i++)
   2866 	aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
   2867     }
   2868 }
   2869 
   2870 static void
   2871 do_vec_TBL (sim_cpu *cpu)
   2872 {
   2873   /* instr[31]    = 0
   2874      instr[30]    = half(0)/full(1)
   2875      instr[29,21] = 00 1110 000
   2876      instr[20,16] = Vm
   2877      instr[15]    = 0
   2878      instr[14,13] = vec length
   2879      instr[12,10] = 000
   2880      instr[9,5]   = V start
   2881      instr[4,0]   = V dest  */
   2882 
   2883   int full    = INSTR (30, 30);
   2884   int len     = INSTR (14, 13) + 1;
   2885   unsigned vm = INSTR (20, 16);
   2886   unsigned vn = INSTR (9, 5);
   2887   unsigned vd = INSTR (4, 0);
   2888   unsigned i;
   2889 
   2890   NYI_assert (29, 21, 0x070);
   2891   NYI_assert (12, 10, 0);
   2892 
   2893   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2894   for (i = 0; i < (full ? 16 : 8); i++)
   2895     {
   2896       unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
   2897       uint8_t val;
   2898 
   2899       if (selector < 16)
   2900 	val = aarch64_get_vec_u8 (cpu, vn, selector);
   2901       else if (selector < 32)
   2902 	val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
   2903       else if (selector < 48)
   2904 	val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
   2905       else if (selector < 64)
   2906 	val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
   2907       else
   2908 	val = 0;
   2909 
   2910       aarch64_set_vec_u8 (cpu, vd, i, val);
   2911     }
   2912 }
   2913 
   2914 static void
   2915 do_vec_TRN (sim_cpu *cpu)
   2916 {
   2917   /* instr[31]    = 0
   2918      instr[30]    = half(0)/full(1)
   2919      instr[29,24] = 00 1110
   2920      instr[23,22] = size
   2921      instr[21]    = 0
   2922      instr[20,16] = Vm
   2923      instr[15]    = 0
   2924      instr[14]    = TRN1 (0) / TRN2 (1)
   2925      instr[13,10] = 1010
   2926      instr[9,5]   = V source
   2927      instr[4,0]   = V dest.  */
   2928 
   2929   int full    = INSTR (30, 30);
   2930   int second  = INSTR (14, 14);
   2931   unsigned vm = INSTR (20, 16);
   2932   unsigned vn = INSTR (9, 5);
   2933   unsigned vd = INSTR (4, 0);
   2934   unsigned i;
   2935 
   2936   NYI_assert (29, 24, 0x0E);
   2937   NYI_assert (13, 10, 0xA);
   2938 
   2939   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   2940   switch (INSTR (23, 22))
   2941     {
   2942     case 0:
   2943       for (i = 0; i < (full ? 8 : 4); i++)
   2944 	{
   2945 	  aarch64_set_vec_u8
   2946 	    (cpu, vd, i * 2,
   2947 	     aarch64_get_vec_u8 (cpu, second ? vm : vn, i * 2));
   2948 	  aarch64_set_vec_u8
   2949 	    (cpu, vd, 1 * 2 + 1,
   2950 	     aarch64_get_vec_u8 (cpu, second ? vn : vm, i * 2 + 1));
   2951 	}
   2952       break;
   2953 
   2954     case 1:
   2955       for (i = 0; i < (full ? 4 : 2); i++)
   2956 	{
   2957 	  aarch64_set_vec_u16
   2958 	    (cpu, vd, i * 2,
   2959 	     aarch64_get_vec_u16 (cpu, second ? vm : vn, i * 2));
   2960 	  aarch64_set_vec_u16
   2961 	    (cpu, vd, 1 * 2 + 1,
   2962 	     aarch64_get_vec_u16 (cpu, second ? vn : vm, i * 2 + 1));
   2963 	}
   2964       break;
   2965 
   2966     case 2:
   2967       aarch64_set_vec_u32
   2968 	(cpu, vd, 0, aarch64_get_vec_u32 (cpu, second ? vm : vn, 0));
   2969       aarch64_set_vec_u32
   2970 	(cpu, vd, 1, aarch64_get_vec_u32 (cpu, second ? vn : vm, 1));
   2971       aarch64_set_vec_u32
   2972 	(cpu, vd, 2, aarch64_get_vec_u32 (cpu, second ? vm : vn, 2));
   2973       aarch64_set_vec_u32
   2974 	(cpu, vd, 3, aarch64_get_vec_u32 (cpu, second ? vn : vm, 3));
   2975       break;
   2976 
   2977     case 3:
   2978       if (! full)
   2979 	HALT_UNALLOC;
   2980 
   2981       aarch64_set_vec_u64 (cpu, vd, 0,
   2982 			   aarch64_get_vec_u64 (cpu, second ? vm : vn, 0));
   2983       aarch64_set_vec_u64 (cpu, vd, 1,
   2984 			   aarch64_get_vec_u64 (cpu, second ? vn : vm, 1));
   2985       break;
   2986     }
   2987 }
   2988 
   2989 static void
   2990 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
   2991 {
   2992   /* instr[31]    = 0
   2993      instr[30]    = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
   2994                     [must be 1 for 64-bit xfer]
   2995      instr[29,20] = 00 1110 0000
   2996      instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
   2997                                   0100=> 32-bits. 1000=>64-bits
   2998      instr[15,10] = 0000 11
   2999      instr[9,5]   = W source
   3000      instr[4,0]   = V dest.  */
   3001 
   3002   unsigned i;
   3003   unsigned Vd = INSTR (4, 0);
   3004   unsigned Rs = INSTR (9, 5);
   3005   int both    = INSTR (30, 30);
   3006 
   3007   NYI_assert (29, 20, 0x0E0);
   3008   NYI_assert (15, 10, 0x03);
   3009 
   3010   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3011   switch (INSTR (19, 16))
   3012     {
   3013     case 1:
   3014       for (i = 0; i < (both ? 16 : 8); i++)
   3015 	aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
   3016       break;
   3017 
   3018     case 2:
   3019       for (i = 0; i < (both ? 8 : 4); i++)
   3020 	aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
   3021       break;
   3022 
   3023     case 4:
   3024       for (i = 0; i < (both ? 4 : 2); i++)
   3025 	aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
   3026       break;
   3027 
   3028     case 8:
   3029       if (!both)
   3030 	HALT_NYI;
   3031       aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
   3032       aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
   3033       break;
   3034 
   3035     default:
   3036       HALT_NYI;
   3037     }
   3038 }
   3039 
   3040 static void
   3041 do_vec_UZP (sim_cpu *cpu)
   3042 {
   3043   /* instr[31]    = 0
   3044      instr[30]    = half(0)/full(1)
   3045      instr[29,24] = 00 1110
   3046      instr[23,22] = size: byte(00), half(01), word (10), long (11)
   3047      instr[21]    = 0
   3048      instr[20,16] = Vm
   3049      instr[15]    = 0
   3050      instr[14]    = lower (0) / upper (1)
   3051      instr[13,10] = 0110
   3052      instr[9,5]   = Vn
   3053      instr[4,0]   = Vd.  */
   3054 
   3055   int full = INSTR (30, 30);
   3056   int upper = INSTR (14, 14);
   3057 
   3058   unsigned vm = INSTR (20, 16);
   3059   unsigned vn = INSTR (9, 5);
   3060   unsigned vd = INSTR (4, 0);
   3061 
   3062   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
   3063   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
   3064   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
   3065   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
   3066 
   3067   uint64_t val1;
   3068   uint64_t val2;
   3069 
   3070   uint64_t input2 = full ? val_n2 : val_m1;
   3071 
   3072   NYI_assert (29, 24, 0x0E);
   3073   NYI_assert (21, 21, 0);
   3074   NYI_assert (15, 15, 0);
   3075   NYI_assert (13, 10, 6);
   3076 
   3077   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3078   switch (INSTR (23, 22))
   3079     {
   3080     case 0:
   3081       val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
   3082       val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
   3083       val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
   3084       val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
   3085 
   3086       val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
   3087       val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
   3088       val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
   3089       val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
   3090 
   3091       if (full)
   3092 	{
   3093 	  val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
   3094 	  val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
   3095 	  val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
   3096 	  val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
   3097 
   3098 	  val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
   3099 	  val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
   3100 	  val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
   3101 	  val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
   3102 	}
   3103       break;
   3104 
   3105     case 1:
   3106       val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
   3107       val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
   3108 
   3109       val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;;
   3110       val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
   3111 
   3112       if (full)
   3113 	{
   3114 	  val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
   3115 	  val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
   3116 
   3117 	  val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
   3118 	  val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
   3119 	}
   3120       break;
   3121 
   3122     case 2:
   3123       val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
   3124       val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
   3125 
   3126       if (full)
   3127 	{
   3128 	  val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
   3129 	  val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
   3130 	}
   3131       break;
   3132 
   3133     case 3:
   3134       if (! full)
   3135 	HALT_UNALLOC;
   3136 
   3137       val1 = upper ? val_n2 : val_n1;
   3138       val2 = upper ? val_m2 : val_m1;
   3139       break;
   3140     }
   3141 
   3142   aarch64_set_vec_u64 (cpu, vd, 0, val1);
   3143   if (full)
   3144     aarch64_set_vec_u64 (cpu, vd, 1, val2);
   3145 }
   3146 
   3147 static void
   3148 do_vec_ZIP (sim_cpu *cpu)
   3149 {
   3150   /* instr[31]    = 0
   3151      instr[30]    = half(0)/full(1)
   3152      instr[29,24] = 00 1110
   3153      instr[23,22] = size: byte(00), hald(01), word (10), long (11)
   3154      instr[21]    = 0
   3155      instr[20,16] = Vm
   3156      instr[15]    = 0
   3157      instr[14]    = lower (0) / upper (1)
   3158      instr[13,10] = 1110
   3159      instr[9,5]   = Vn
   3160      instr[4,0]   = Vd.  */
   3161 
   3162   int full = INSTR (30, 30);
   3163   int upper = INSTR (14, 14);
   3164 
   3165   unsigned vm = INSTR (20, 16);
   3166   unsigned vn = INSTR (9, 5);
   3167   unsigned vd = INSTR (4, 0);
   3168 
   3169   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
   3170   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
   3171   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
   3172   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
   3173 
   3174   uint64_t val1 = 0;
   3175   uint64_t val2 = 0;
   3176 
   3177   uint64_t input1 = upper ? val_n1 : val_m1;
   3178   uint64_t input2 = upper ? val_n2 : val_m2;
   3179 
   3180   NYI_assert (29, 24, 0x0E);
   3181   NYI_assert (21, 21, 0);
   3182   NYI_assert (15, 15, 0);
   3183   NYI_assert (13, 10, 0xE);
   3184 
   3185   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3186   switch (INSTR (23, 23))
   3187     {
   3188     case 0:
   3189       val1 =
   3190 	  ((input1 <<  0) & (0xFF    <<  0))
   3191 	| ((input2 <<  8) & (0xFF    <<  8))
   3192 	| ((input1 <<  8) & (0xFF    << 16))
   3193 	| ((input2 << 16) & (0xFF    << 24))
   3194 	| ((input1 << 16) & (0xFFULL << 32))
   3195 	| ((input2 << 24) & (0xFFULL << 40))
   3196 	| ((input1 << 24) & (0xFFULL << 48))
   3197 	| ((input2 << 32) & (0xFFULL << 56));
   3198 
   3199       val2 =
   3200 	  ((input1 >> 32) & (0xFF    <<  0))
   3201 	| ((input2 >> 24) & (0xFF    <<  8))
   3202 	| ((input1 >> 24) & (0xFF    << 16))
   3203 	| ((input2 >> 16) & (0xFF    << 24))
   3204 	| ((input1 >> 16) & (0xFFULL << 32))
   3205 	| ((input2 >>  8) & (0xFFULL << 40))
   3206 	| ((input1 >>  8) & (0xFFULL << 48))
   3207 	| ((input2 >>  0) & (0xFFULL << 56));
   3208       break;
   3209 
   3210     case 1:
   3211       val1 =
   3212 	  ((input1 <<  0) & (0xFFFF    <<  0))
   3213 	| ((input2 << 16) & (0xFFFF    << 16))
   3214 	| ((input1 << 16) & (0xFFFFULL << 32))
   3215 	| ((input2 << 32) & (0xFFFFULL << 48));
   3216 
   3217       val2 =
   3218 	  ((input1 >> 32) & (0xFFFF    <<  0))
   3219 	| ((input2 >> 16) & (0xFFFF    << 16))
   3220 	| ((input1 >> 16) & (0xFFFFULL << 32))
   3221 	| ((input2 >>  0) & (0xFFFFULL << 48));
   3222       break;
   3223 
   3224     case 2:
   3225       val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
   3226       val2 = (input2 & 0xFFFFFFFFULL) | (input1 << 32);
   3227       break;
   3228 
   3229     case 3:
   3230       val1 = input1;
   3231       val2 = input2;
   3232       break;
   3233     }
   3234 
   3235   aarch64_set_vec_u64 (cpu, vd, 0, val1);
   3236   if (full)
   3237     aarch64_set_vec_u64 (cpu, vd, 1, val2);
   3238 }
   3239 
/* Floating point immediates are encoded in 8 bits.
   fpimm[7]   = sign bit.
   fpimm[6:4] = encoded exponent.
   fpimm[3:0] = fraction (with an implied leading 1).
   The value is +/- (16 + fraction) / 16 * 2^x, where x is e + 1 for
   exponent codes 0..3 and e - 7 for exponent codes 4..7.  */

static float
fp_immediate_for_encoding_32 (uint32_t imm8)
{
  uint32_t negative = (imm8 >> 7) & 0x1;
  uint32_t exponent = (imm8 >> 4) & 0x7;
  uint32_t fraction = imm8 & 0xf;
  uint32_t steps;

  /* Start from the mantissa 1.f, i.e. n/16 where n is 16+f.  */
  float value = (16.0 + fraction) / 16.0;

  if (exponent < 4)
    {
      /* Codes 0..3 scale the mantissa up by 2^(e + 1).  */
      for (steps = exponent + 1; steps > 0; steps--)
        value *= 2.0;
    }
  else
    {
      /* Codes 4..7 scale the mantissa down by 2^(7 - e).  */
      for (steps = 7 - exponent; steps > 0; steps--)
        value /= 2.0;
    }

  return negative ? - value : value;
}
   3280 
/* Expand an 8-bit floating point immediate to a double.
   imm8[7] is the sign, imm8[6:4] the encoded exponent and imm8[3:0]
   the fraction of a mantissa with an implied leading 1.  */

static double
fp_immediate_for_encoding_64 (uint32_t imm8)
{
  uint32_t negative = (imm8 >> 7) & 0x1;
  uint32_t exponent = (imm8 >> 4) & 0x7;
  uint32_t fraction = imm8 & 0xf;
  uint32_t steps;

  /* Start from the mantissa 1.f, i.e. n/16 where n is 16+f.  */
  double value = (16.0 + fraction) / 16.0;

  if (exponent < 4)
    {
      /* Codes 0..3 scale the mantissa up by 2^(e + 1).  */
      for (steps = exponent + 1; steps > 0; steps--)
        value *= 2.0;
    }
  else
    {
      /* Codes 4..7 scale the mantissa down by 2^(7 - e).  */
      for (steps = 7 - exponent; steps > 0; steps--)
        value /= 2.0;
    }

  return negative ? - value : value;
}
   3315 
   3316 static void
   3317 do_vec_MOV_immediate (sim_cpu *cpu)
   3318 {
   3319   /* instr[31]    = 0
   3320      instr[30]    = full/half selector
   3321      instr[29,19] = 00111100000
   3322      instr[18,16] = high 3 bits of uimm8
   3323      instr[15,12] = size & shift:
   3324                                   0000 => 32-bit
   3325                                   0010 => 32-bit + LSL#8
   3326                                   0100 => 32-bit + LSL#16
   3327                                   0110 => 32-bit + LSL#24
   3328                                   1010 => 16-bit + LSL#8
   3329                                   1000 => 16-bit
   3330                                   1101 => 32-bit + MSL#16
   3331                                   1100 => 32-bit + MSL#8
   3332                                   1110 => 8-bit
   3333                                   1111 => double
   3334      instr[11,10] = 01
   3335      instr[9,5]   = low 5-bits of uimm8
   3336      instr[4,0]   = Vd.  */
   3337 
   3338   int full     = INSTR (30, 30);
   3339   unsigned vd  = INSTR (4, 0);
   3340   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
   3341   unsigned i;
   3342 
   3343   NYI_assert (29, 19, 0x1E0);
   3344   NYI_assert (11, 10, 1);
   3345 
   3346   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3347   switch (INSTR (15, 12))
   3348     {
   3349     case 0x0: /* 32-bit, no shift.  */
   3350     case 0x2: /* 32-bit, shift by 8.  */
   3351     case 0x4: /* 32-bit, shift by 16.  */
   3352     case 0x6: /* 32-bit, shift by 24.  */
   3353       val <<= (8 * INSTR (14, 13));
   3354       for (i = 0; i < (full ? 4 : 2); i++)
   3355 	aarch64_set_vec_u32 (cpu, vd, i, val);
   3356       break;
   3357 
   3358     case 0xa: /* 16-bit, shift by 8.  */
   3359       val <<= 8;
   3360       /* Fall through.  */
   3361     case 0x8: /* 16-bit, no shift.  */
   3362       for (i = 0; i < (full ? 8 : 4); i++)
   3363 	aarch64_set_vec_u16 (cpu, vd, i, val);
   3364       break;
   3365 
   3366     case 0xd: /* 32-bit, mask shift by 16.  */
   3367       val <<= 8;
   3368       val |= 0xFF;
   3369       /* Fall through.  */
   3370     case 0xc: /* 32-bit, mask shift by 8. */
   3371       val <<= 8;
   3372       val |= 0xFF;
   3373       for (i = 0; i < (full ? 4 : 2); i++)
   3374 	aarch64_set_vec_u32 (cpu, vd, i, val);
   3375       break;
   3376 
   3377     case 0xe: /* 8-bit, no shift.  */
   3378       for (i = 0; i < (full ? 16 : 8); i++)
   3379 	aarch64_set_vec_u8 (cpu, vd, i, val);
   3380       break;
   3381 
   3382     case 0xf: /* FMOV Vs.{2|4}S, #fpimm.  */
   3383       {
   3384 	float u = fp_immediate_for_encoding_32 (val);
   3385 	for (i = 0; i < (full ? 4 : 2); i++)
   3386 	  aarch64_set_vec_float (cpu, vd, i, u);
   3387 	break;
   3388       }
   3389 
   3390     default:
   3391       HALT_NYI;
   3392     }
   3393 }
   3394 
   3395 static void
   3396 do_vec_MVNI (sim_cpu *cpu)
   3397 {
   3398   /* instr[31]    = 0
   3399      instr[30]    = full/half selector
   3400      instr[29,19] = 10111100000
   3401      instr[18,16] = high 3 bits of uimm8
   3402      instr[15,12] = selector
   3403      instr[11,10] = 01
   3404      instr[9,5]   = low 5-bits of uimm8
   3405      instr[4,0]   = Vd.  */
   3406 
   3407   int full     = INSTR (30, 30);
   3408   unsigned vd  = INSTR (4, 0);
   3409   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
   3410   unsigned i;
   3411 
   3412   NYI_assert (29, 19, 0x5E0);
   3413   NYI_assert (11, 10, 1);
   3414 
   3415   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3416   switch (INSTR (15, 12))
   3417     {
   3418     case 0x0: /* 32-bit, no shift.  */
   3419     case 0x2: /* 32-bit, shift by 8.  */
   3420     case 0x4: /* 32-bit, shift by 16.  */
   3421     case 0x6: /* 32-bit, shift by 24.  */
   3422       val <<= (8 * INSTR (14, 13));
   3423       val = ~ val;
   3424       for (i = 0; i < (full ? 4 : 2); i++)
   3425 	aarch64_set_vec_u32 (cpu, vd, i, val);
   3426       return;
   3427 
   3428     case 0xa: /* 16-bit, 8 bit shift. */
   3429       val <<= 8;
   3430     case 0x8: /* 16-bit, no shift. */
   3431       val = ~ val;
   3432       for (i = 0; i < (full ? 8 : 4); i++)
   3433 	aarch64_set_vec_u16 (cpu, vd, i, val);
   3434       return;
   3435 
   3436     case 0xd: /* 32-bit, mask shift by 16.  */
   3437       val <<= 8;
   3438       val |= 0xFF;
   3439     case 0xc: /* 32-bit, mask shift by 8. */
   3440       val <<= 8;
   3441       val |= 0xFF;
   3442       val = ~ val;
   3443       for (i = 0; i < (full ? 4 : 2); i++)
   3444 	aarch64_set_vec_u32 (cpu, vd, i, val);
   3445       return;
   3446 
   3447     case 0xE: /* MOVI Dn, #mask64 */
   3448       {
   3449 	uint64_t mask = 0;
   3450 
   3451 	for (i = 0; i < 8; i++)
   3452 	  if (val & (1 << i))
   3453 	    mask |= (0xFFUL << (i * 8));
   3454 	aarch64_set_vec_u64 (cpu, vd, 0, mask);
   3455 	aarch64_set_vec_u64 (cpu, vd, 1, mask);
   3456 	return;
   3457       }
   3458 
   3459     case 0xf: /* FMOV Vd.2D, #fpimm.  */
   3460       {
   3461 	double u = fp_immediate_for_encoding_64 (val);
   3462 
   3463 	if (! full)
   3464 	  HALT_UNALLOC;
   3465 
   3466 	aarch64_set_vec_double (cpu, vd, 0, u);
   3467 	aarch64_set_vec_double (cpu, vd, 1, u);
   3468 	return;
   3469       }
   3470 
   3471     default:
   3472       HALT_NYI;
   3473     }
   3474 }
   3475 
   3476 #define ABS(A) ((A) < 0 ? - (A) : (A))
   3477 
   3478 static void
   3479 do_vec_ABS (sim_cpu *cpu)
   3480 {
   3481   /* instr[31]    = 0
   3482      instr[30]    = half(0)/full(1)
   3483      instr[29,24] = 00 1110
   3484      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
   3485      instr[21,10] = 10 0000 1011 10
   3486      instr[9,5]   = Vn
   3487      instr[4.0]   = Vd.  */
   3488 
   3489   unsigned vn = INSTR (9, 5);
   3490   unsigned vd = INSTR (4, 0);
   3491   unsigned full = INSTR (30, 30);
   3492   unsigned i;
   3493 
   3494   NYI_assert (29, 24, 0x0E);
   3495   NYI_assert (21, 10, 0x82E);
   3496 
   3497   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3498   switch (INSTR (23, 22))
   3499     {
   3500     case 0:
   3501       for (i = 0; i < (full ? 16 : 8); i++)
   3502 	aarch64_set_vec_s8 (cpu, vd, i,
   3503 			    ABS (aarch64_get_vec_s8 (cpu, vn, i)));
   3504       break;
   3505 
   3506     case 1:
   3507       for (i = 0; i < (full ? 8 : 4); i++)
   3508 	aarch64_set_vec_s16 (cpu, vd, i,
   3509 			     ABS (aarch64_get_vec_s16 (cpu, vn, i)));
   3510       break;
   3511 
   3512     case 2:
   3513       for (i = 0; i < (full ? 4 : 2); i++)
   3514 	aarch64_set_vec_s32 (cpu, vd, i,
   3515 			     ABS (aarch64_get_vec_s32 (cpu, vn, i)));
   3516       break;
   3517 
   3518     case 3:
   3519       if (! full)
   3520 	HALT_NYI;
   3521       for (i = 0; i < 2; i++)
   3522 	aarch64_set_vec_s64 (cpu, vd, i,
   3523 			     ABS (aarch64_get_vec_s64 (cpu, vn, i)));
   3524       break;
   3525     }
   3526 }
   3527 
   3528 static void
   3529 do_vec_ADDV (sim_cpu *cpu)
   3530 {
   3531   /* instr[31]    = 0
   3532      instr[30]    = full/half selector
   3533      instr[29,24] = 00 1110
   3534      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
   3535      instr[21,10] = 11 0001 1011 10
   3536      instr[9,5]   = Vm
   3537      instr[4.0]   = Rd.  */
   3538 
   3539   unsigned vm = INSTR (9, 5);
   3540   unsigned rd = INSTR (4, 0);
   3541   unsigned i;
   3542   int      full = INSTR (30, 30);
   3543 
   3544   NYI_assert (29, 24, 0x0E);
   3545   NYI_assert (21, 10, 0xC6E);
   3546 
   3547   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3548   switch (INSTR (23, 22))
   3549     {
   3550     case 0:
   3551       {
   3552 	uint8_t val = 0;
   3553 	for (i = 0; i < (full ? 16 : 8); i++)
   3554 	  val += aarch64_get_vec_u8 (cpu, vm, i);
   3555 	aarch64_set_vec_u64 (cpu, rd, 0, val);
   3556 	return;
   3557       }
   3558 
   3559     case 1:
   3560       {
   3561 	uint16_t val = 0;
   3562 	for (i = 0; i < (full ? 8 : 4); i++)
   3563 	  val += aarch64_get_vec_u16 (cpu, vm, i);
   3564 	aarch64_set_vec_u64 (cpu, rd, 0, val);
   3565 	return;
   3566       }
   3567 
   3568     case 2:
   3569       {
   3570 	uint32_t val = 0;
   3571 	if (! full)
   3572 	  HALT_UNALLOC;
   3573 	for (i = 0; i < 4; i++)
   3574 	  val += aarch64_get_vec_u32 (cpu, vm, i);
   3575 	aarch64_set_vec_u64 (cpu, rd, 0, val);
   3576 	return;
   3577       }
   3578 
   3579     case 3:
   3580       HALT_UNALLOC;
   3581     }
   3582 }
   3583 
   3584 static void
   3585 do_vec_ins_2 (sim_cpu *cpu)
   3586 {
   3587   /* instr[31,21] = 01001110000
   3588      instr[20,18] = size & element selector
   3589      instr[17,14] = 0000
   3590      instr[13]    = direction: to vec(0), from vec (1)
   3591      instr[12,10] = 111
   3592      instr[9,5]   = Vm
   3593      instr[4,0]   = Vd.  */
   3594 
   3595   unsigned elem;
   3596   unsigned vm = INSTR (9, 5);
   3597   unsigned vd = INSTR (4, 0);
   3598 
   3599   NYI_assert (31, 21, 0x270);
   3600   NYI_assert (17, 14, 0);
   3601   NYI_assert (12, 10, 7);
   3602 
   3603   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3604   if (INSTR (13, 13) == 1)
   3605     {
   3606       if (INSTR (18, 18) == 1)
   3607 	{
   3608 	  /* 32-bit moves.  */
   3609 	  elem = INSTR (20, 19);
   3610 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
   3611 			       aarch64_get_vec_u32 (cpu, vm, elem));
   3612 	}
   3613       else
   3614 	{
   3615 	  /* 64-bit moves.  */
   3616 	  if (INSTR (19, 19) != 1)
   3617 	    HALT_NYI;
   3618 
   3619 	  elem = INSTR (20, 20);
   3620 	  aarch64_set_reg_u64 (cpu, vd, NO_SP,
   3621 			       aarch64_get_vec_u64 (cpu, vm, elem));
   3622 	}
   3623     }
   3624   else
   3625     {
   3626       if (INSTR (18, 18) == 1)
   3627 	{
   3628 	  /* 32-bit moves.  */
   3629 	  elem = INSTR (20, 19);
   3630 	  aarch64_set_vec_u32 (cpu, vd, elem,
   3631 			       aarch64_get_reg_u32 (cpu, vm, NO_SP));
   3632 	}
   3633       else
   3634 	{
   3635 	  /* 64-bit moves.  */
   3636 	  if (INSTR (19, 19) != 1)
   3637 	    HALT_NYI;
   3638 
   3639 	  elem = INSTR (20, 20);
   3640 	  aarch64_set_vec_u64 (cpu, vd, elem,
   3641 			       aarch64_get_reg_u64 (cpu, vm, NO_SP));
   3642 	}
   3643     }
   3644 }
   3645 
/* Multiply N lanes of Vn by the matching lanes of Vm and store the
   products in lanes 0 .. N-1 of Vd.  Source lanes are read with
   aarch64_get_vec_<READ_TYPE> starting at lane BIAS, held as DST_TYPE,
   and the products are written with aarch64_set_vec_<WRITE_TYPE>.

   All sources are fetched before any result is stored, so Vd may be
   the same register as Vn or Vm.

   The expansion uses `cpu', `vn', `vm', `vd', `bias' and the loop
   counter `i' from the enclosing scope.  */
#define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)		\
  do									\
    {									\
      DST_TYPE lhs_[N];							\
      DST_TYPE rhs_[N];							\
									\
      for (i = 0; i < (N); i++)						\
	{								\
	  lhs_[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias);	\
	  rhs_[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias);	\
	}								\
									\
      for (i = 0; i < (N); i++)						\
	aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, lhs_[i] * rhs_[i]);	\
    }									\
  while (0)
   3660 
   3661 static void
   3662 do_vec_mull (sim_cpu *cpu)
   3663 {
   3664   /* instr[31]    = 0
   3665      instr[30]    = lower(0)/upper(1) selector
   3666      instr[29]    = signed(0)/unsigned(1)
   3667      instr[28,24] = 0 1110
   3668      instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
   3669      instr[21]    = 1
   3670      instr[20,16] = Vm
   3671      instr[15,10] = 11 0000
   3672      instr[9,5]   = Vn
   3673      instr[4.0]   = Vd.  */
   3674 
   3675   int    unsign = INSTR (29, 29);
   3676   int    bias = INSTR (30, 30);
   3677   unsigned vm = INSTR (20, 16);
   3678   unsigned vn = INSTR ( 9,  5);
   3679   unsigned vd = INSTR ( 4,  0);
   3680   unsigned i;
   3681 
   3682   NYI_assert (28, 24, 0x0E);
   3683   NYI_assert (15, 10, 0x30);
   3684 
   3685   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3686   /* NB: Read source values before writing results, in case
   3687      the source and destination vectors are the same.  */
   3688   switch (INSTR (23, 22))
   3689     {
   3690     case 0:
   3691       if (bias)
   3692 	bias = 8;
   3693       if (unsign)
   3694 	DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
   3695       else
   3696 	DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
   3697       return;
   3698 
   3699     case 1:
   3700       if (bias)
   3701 	bias = 4;
   3702       if (unsign)
   3703 	DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
   3704       else
   3705 	DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
   3706       return;
   3707 
   3708     case 2:
   3709       if (bias)
   3710 	bias = 2;
   3711       if (unsign)
   3712 	DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
   3713       else
   3714 	DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
   3715       return;
   3716 
   3717     case 3:
   3718       HALT_NYI;
   3719     }
   3720 }
   3721 
   3722 static void
   3723 do_vec_fadd (sim_cpu *cpu)
   3724 {
   3725   /* instr[31]    = 0
   3726      instr[30]    = half(0)/full(1)
   3727      instr[29,24] = 001110
   3728      instr[23]    = FADD(0)/FSUB(1)
   3729      instr[22]    = float (0)/double(1)
   3730      instr[21]    = 1
   3731      instr[20,16] = Vm
   3732      instr[15,10] = 110101
   3733      instr[9,5]   = Vn
   3734      instr[4.0]   = Vd.  */
   3735 
   3736   unsigned vm = INSTR (20, 16);
   3737   unsigned vn = INSTR (9, 5);
   3738   unsigned vd = INSTR (4, 0);
   3739   unsigned i;
   3740   int      full = INSTR (30, 30);
   3741 
   3742   NYI_assert (29, 24, 0x0E);
   3743   NYI_assert (21, 21, 1);
   3744   NYI_assert (15, 10, 0x35);
   3745 
   3746   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3747   if (INSTR (23, 23))
   3748     {
   3749       if (INSTR (22, 22))
   3750 	{
   3751 	  if (! full)
   3752 	    HALT_NYI;
   3753 
   3754 	  for (i = 0; i < 2; i++)
   3755 	    aarch64_set_vec_double (cpu, vd, i,
   3756 				    aarch64_get_vec_double (cpu, vn, i)
   3757 				    - aarch64_get_vec_double (cpu, vm, i));
   3758 	}
   3759       else
   3760 	{
   3761 	  for (i = 0; i < (full ? 4 : 2); i++)
   3762 	    aarch64_set_vec_float (cpu, vd, i,
   3763 				   aarch64_get_vec_float (cpu, vn, i)
   3764 				   - aarch64_get_vec_float (cpu, vm, i));
   3765 	}
   3766     }
   3767   else
   3768     {
   3769       if (INSTR (22, 22))
   3770 	{
   3771 	  if (! full)
   3772 	    HALT_NYI;
   3773 
   3774 	  for (i = 0; i < 2; i++)
   3775 	    aarch64_set_vec_double (cpu, vd, i,
   3776 				    aarch64_get_vec_double (cpu, vm, i)
   3777 				    + aarch64_get_vec_double (cpu, vn, i));
   3778 	}
   3779       else
   3780 	{
   3781 	  for (i = 0; i < (full ? 4 : 2); i++)
   3782 	    aarch64_set_vec_float (cpu, vd, i,
   3783 				   aarch64_get_vec_float (cpu, vm, i)
   3784 				   + aarch64_get_vec_float (cpu, vn, i));
   3785 	}
   3786     }
   3787 }
   3788 
   3789 static void
   3790 do_vec_add (sim_cpu *cpu)
   3791 {
   3792   /* instr[31]    = 0
   3793      instr[30]    = full/half selector
   3794      instr[29,24] = 001110
   3795      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
   3796      instr[21]    = 1
   3797      instr[20,16] = Vn
   3798      instr[15,10] = 100001
   3799      instr[9,5]   = Vm
   3800      instr[4.0]   = Vd.  */
   3801 
   3802   unsigned vm = INSTR (20, 16);
   3803   unsigned vn = INSTR (9, 5);
   3804   unsigned vd = INSTR (4, 0);
   3805   unsigned i;
   3806   int      full = INSTR (30, 30);
   3807 
   3808   NYI_assert (29, 24, 0x0E);
   3809   NYI_assert (21, 21, 1);
   3810   NYI_assert (15, 10, 0x21);
   3811 
   3812   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3813   switch (INSTR (23, 22))
   3814     {
   3815     case 0:
   3816       for (i = 0; i < (full ? 16 : 8); i++)
   3817 	aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
   3818 			    + aarch64_get_vec_u8 (cpu, vm, i));
   3819       return;
   3820 
   3821     case 1:
   3822       for (i = 0; i < (full ? 8 : 4); i++)
   3823 	aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
   3824 			     + aarch64_get_vec_u16 (cpu, vm, i));
   3825       return;
   3826 
   3827     case 2:
   3828       for (i = 0; i < (full ? 4 : 2); i++)
   3829 	aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
   3830 			     + aarch64_get_vec_u32 (cpu, vm, i));
   3831       return;
   3832 
   3833     case 3:
   3834       if (! full)
   3835 	HALT_UNALLOC;
   3836       aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
   3837 			   + aarch64_get_vec_u64 (cpu, vm, 0));
   3838       aarch64_set_vec_u64 (cpu, vd, 1,
   3839 			   aarch64_get_vec_u64 (cpu, vn, 1)
   3840 			   + aarch64_get_vec_u64 (cpu, vm, 1));
   3841       return;
   3842     }
   3843 }
   3844 
   3845 static void
   3846 do_vec_mul (sim_cpu *cpu)
   3847 {
   3848   /* instr[31]    = 0
   3849      instr[30]    = full/half selector
   3850      instr[29,24] = 00 1110
   3851      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
   3852      instr[21]    = 1
   3853      instr[20,16] = Vn
   3854      instr[15,10] = 10 0111
   3855      instr[9,5]   = Vm
   3856      instr[4.0]   = Vd.  */
   3857 
   3858   unsigned vm = INSTR (20, 16);
   3859   unsigned vn = INSTR (9, 5);
   3860   unsigned vd = INSTR (4, 0);
   3861   unsigned i;
   3862   int      full = INSTR (30, 30);
   3863   int      bias = 0;
   3864 
   3865   NYI_assert (29, 24, 0x0E);
   3866   NYI_assert (21, 21, 1);
   3867   NYI_assert (15, 10, 0x27);
   3868 
   3869   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3870   switch (INSTR (23, 22))
   3871     {
   3872     case 0:
   3873       DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
   3874       return;
   3875 
   3876     case 1:
   3877       DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
   3878       return;
   3879 
   3880     case 2:
   3881       DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
   3882       return;
   3883 
   3884     case 3:
   3885       HALT_UNALLOC;
   3886     }
   3887 }
   3888 
   3889 static void
   3890 do_vec_MLA (sim_cpu *cpu)
   3891 {
   3892   /* instr[31]    = 0
   3893      instr[30]    = full/half selector
   3894      instr[29,24] = 00 1110
   3895      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
   3896      instr[21]    = 1
   3897      instr[20,16] = Vn
   3898      instr[15,10] = 1001 01
   3899      instr[9,5]   = Vm
   3900      instr[4.0]   = Vd.  */
   3901 
   3902   unsigned vm = INSTR (20, 16);
   3903   unsigned vn = INSTR (9, 5);
   3904   unsigned vd = INSTR (4, 0);
   3905   unsigned i;
   3906   int      full = INSTR (30, 30);
   3907 
   3908   NYI_assert (29, 24, 0x0E);
   3909   NYI_assert (21, 21, 1);
   3910   NYI_assert (15, 10, 0x25);
   3911 
   3912   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   3913   switch (INSTR (23, 22))
   3914     {
   3915     case 0:
   3916       for (i = 0; i < (full ? 16 : 8); i++)
   3917 	aarch64_set_vec_u8 (cpu, vd, i,
   3918 			    aarch64_get_vec_u8 (cpu, vd, i)
   3919 			    + (aarch64_get_vec_u8 (cpu, vn, i)
   3920 			       * aarch64_get_vec_u8 (cpu, vm, i)));
   3921       return;
   3922 
   3923     case 1:
   3924       for (i = 0; i < (full ? 8 : 4); i++)
   3925 	aarch64_set_vec_u16 (cpu, vd, i,
   3926 			     aarch64_get_vec_u16 (cpu, vd, i)
   3927 			     + (aarch64_get_vec_u16 (cpu, vn, i)
   3928 				* aarch64_get_vec_u16 (cpu, vm, i)));
   3929       return;
   3930 
   3931     case 2:
   3932       for (i = 0; i < (full ? 4 : 2); i++)
   3933 	aarch64_set_vec_u32 (cpu, vd, i,
   3934 			     aarch64_get_vec_u32 (cpu, vd, i)
   3935 			     + (aarch64_get_vec_u32 (cpu, vn, i)
   3936 				* aarch64_get_vec_u32 (cpu, vm, i)));
   3937       return;
   3938 
   3939     default:
   3940       HALT_UNALLOC;
   3941     }
   3942 }
   3943 
/* Return the larger of A and B, treating a NaN as a missing operand:
   if exactly one argument is a NaN the other one is returned, and if
   both are NaNs A is returned.  For zeros of opposite sign the result
   is +0 whichever order the arguments are given in.  */

static float
fmaxnm (float a, float b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  /* Equal operands only differ when they are zeros of opposite sign;
     a plain comparison cannot tell those apart, so pick the one
     without the sign bit.  */
  if (a == b)
    return signbit (a) ? b : a;

  return a > b ? a : b;
}
   3957 
/* Return the smaller of A and B, treating a NaN as a missing operand:
   if exactly one argument is a NaN the other one is returned, and if
   both are NaNs A is returned.  For zeros of opposite sign the result
   is -0 whichever order the arguments are given in.  */

static float
fminnm (float a, float b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  /* Equal operands only differ when they are zeros of opposite sign;
     a plain comparison cannot tell those apart, so pick the one with
     the sign bit.  */
  if (a == b)
    return signbit (a) ? a : b;

  return a < b ? a : b;
}
   3971 
/* Double precision counterpart of the single precision NaN-ignoring
   maximum: return the larger of A and B, returning the other operand
   when exactly one is a NaN and A when both are.  For zeros of
   opposite sign the result is +0 whichever order they are given in.  */

static double
dmaxnm (double a, double b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  /* Equal operands only differ when they are zeros of opposite sign;
     pick the one without the sign bit.  */
  if (a == b)
    return signbit (a) ? b : a;

  return a > b ? a : b;
}
   3985 
/* Double precision counterpart of the single precision NaN-ignoring
   minimum: return the smaller of A and B, returning the other operand
   when exactly one is a NaN and A when both are.  For zeros of
   opposite sign the result is -0 whichever order they are given in.  */

static double
dminnm (double a, double b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  /* Equal operands only differ when they are zeros of opposite sign;
     pick the one with the sign bit.  */
  if (a == b)
    return signbit (a) ? a : b;

  return a < b ? a : b;
}
   3999 
   4000 static void
   4001 do_vec_FminmaxNMP (sim_cpu *cpu)
   4002 {
   4003   /* instr [31]    = 0
   4004      instr [30]    = half (0)/full (1)
   4005      instr [29,24] = 10 1110
   4006      instr [23]    = max(0)/min(1)
   4007      instr [22]    = float (0)/double (1)
   4008      instr [21]    = 1
   4009      instr [20,16] = Vn
   4010      instr [15,10] = 1100 01
   4011      instr [9,5]   = Vm
   4012      instr [4.0]   = Vd.  */
   4013 
   4014   unsigned vm = INSTR (20, 16);
   4015   unsigned vn = INSTR (9, 5);
   4016   unsigned vd = INSTR (4, 0);
   4017   int      full = INSTR (30, 30);
   4018 
   4019   NYI_assert (29, 24, 0x2E);
   4020   NYI_assert (21, 21, 1);
   4021   NYI_assert (15, 10, 0x31);
   4022 
   4023   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4024   if (INSTR (22, 22))
   4025     {
   4026       double (* fn)(double, double) = INSTR (23, 23)
   4027 	? dminnm : dmaxnm;
   4028 
   4029       if (! full)
   4030 	HALT_NYI;
   4031       aarch64_set_vec_double (cpu, vd, 0,
   4032 			      fn (aarch64_get_vec_double (cpu, vn, 0),
   4033 				  aarch64_get_vec_double (cpu, vn, 1)));
   4034       aarch64_set_vec_double (cpu, vd, 0,
   4035 			      fn (aarch64_get_vec_double (cpu, vm, 0),
   4036 				  aarch64_get_vec_double (cpu, vm, 1)));
   4037     }
   4038   else
   4039     {
   4040       float (* fn)(float, float) = INSTR (23, 23)
   4041 	? fminnm : fmaxnm;
   4042 
   4043       aarch64_set_vec_float (cpu, vd, 0,
   4044 			     fn (aarch64_get_vec_float (cpu, vn, 0),
   4045 				 aarch64_get_vec_float (cpu, vn, 1)));
   4046       if (full)
   4047 	aarch64_set_vec_float (cpu, vd, 1,
   4048 			       fn (aarch64_get_vec_float (cpu, vn, 2),
   4049 				   aarch64_get_vec_float (cpu, vn, 3)));
   4050 
   4051       aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
   4052 			     fn (aarch64_get_vec_float (cpu, vm, 0),
   4053 				 aarch64_get_vec_float (cpu, vm, 1)));
   4054       if (full)
   4055 	aarch64_set_vec_float (cpu, vd, 3,
   4056 			       fn (aarch64_get_vec_float (cpu, vm, 2),
   4057 				   aarch64_get_vec_float (cpu, vm, 3)));
   4058     }
   4059 }
   4060 
   4061 static void
   4062 do_vec_AND (sim_cpu *cpu)
   4063 {
   4064   /* instr[31]    = 0
   4065      instr[30]    = half (0)/full (1)
   4066      instr[29,21] = 001110001
   4067      instr[20,16] = Vm
   4068      instr[15,10] = 000111
   4069      instr[9,5]   = Vn
   4070      instr[4.0]   = Vd.  */
   4071 
   4072   unsigned vm = INSTR (20, 16);
   4073   unsigned vn = INSTR (9, 5);
   4074   unsigned vd = INSTR (4, 0);
   4075   unsigned i;
   4076   int      full = INSTR (30, 30);
   4077 
   4078   NYI_assert (29, 21, 0x071);
   4079   NYI_assert (15, 10, 0x07);
   4080 
   4081   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4082   for (i = 0; i < (full ? 4 : 2); i++)
   4083     aarch64_set_vec_u32 (cpu, vd, i,
   4084 			 aarch64_get_vec_u32 (cpu, vn, i)
   4085 			 & aarch64_get_vec_u32 (cpu, vm, i));
   4086 }
   4087 
   4088 static void
   4089 do_vec_BSL (sim_cpu *cpu)
   4090 {
   4091   /* instr[31]    = 0
   4092      instr[30]    = half (0)/full (1)
   4093      instr[29,21] = 101110011
   4094      instr[20,16] = Vm
   4095      instr[15,10] = 000111
   4096      instr[9,5]   = Vn
   4097      instr[4.0]   = Vd.  */
   4098 
   4099   unsigned vm = INSTR (20, 16);
   4100   unsigned vn = INSTR (9, 5);
   4101   unsigned vd = INSTR (4, 0);
   4102   unsigned i;
   4103   int      full = INSTR (30, 30);
   4104 
   4105   NYI_assert (29, 21, 0x173);
   4106   NYI_assert (15, 10, 0x07);
   4107 
   4108   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4109   for (i = 0; i < (full ? 16 : 8); i++)
   4110     aarch64_set_vec_u8 (cpu, vd, i,
   4111 			(    aarch64_get_vec_u8 (cpu, vd, i)
   4112 			   & aarch64_get_vec_u8 (cpu, vn, i))
   4113 			| ((~ aarch64_get_vec_u8 (cpu, vd, i))
   4114 			   & aarch64_get_vec_u8 (cpu, vm, i)));
   4115 }
   4116 
   4117 static void
   4118 do_vec_EOR (sim_cpu *cpu)
   4119 {
   4120   /* instr[31]    = 0
   4121      instr[30]    = half (0)/full (1)
   4122      instr[29,21] = 10 1110 001
   4123      instr[20,16] = Vm
   4124      instr[15,10] = 000111
   4125      instr[9,5]   = Vn
   4126      instr[4.0]   = Vd.  */
   4127 
   4128   unsigned vm = INSTR (20, 16);
   4129   unsigned vn = INSTR (9, 5);
   4130   unsigned vd = INSTR (4, 0);
   4131   unsigned i;
   4132   int      full = INSTR (30, 30);
   4133 
   4134   NYI_assert (29, 21, 0x171);
   4135   NYI_assert (15, 10, 0x07);
   4136 
   4137   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4138   for (i = 0; i < (full ? 4 : 2); i++)
   4139     aarch64_set_vec_u32 (cpu, vd, i,
   4140 			 aarch64_get_vec_u32 (cpu, vn, i)
   4141 			 ^ aarch64_get_vec_u32 (cpu, vm, i));
   4142 }
   4143 
   4144 static void
   4145 do_vec_bit (sim_cpu *cpu)
   4146 {
   4147   /* instr[31]    = 0
   4148      instr[30]    = half (0)/full (1)
   4149      instr[29,23] = 10 1110 1
   4150      instr[22]    = BIT (0) / BIF (1)
   4151      instr[21]    = 1
   4152      instr[20,16] = Vm
   4153      instr[15,10] = 0001 11
   4154      instr[9,5]   = Vn
   4155      instr[4.0]   = Vd.  */
   4156 
   4157   unsigned vm = INSTR (20, 16);
   4158   unsigned vn = INSTR (9, 5);
   4159   unsigned vd = INSTR (4, 0);
   4160   unsigned full = INSTR (30, 30);
   4161   unsigned test_false = INSTR (22, 22);
   4162   unsigned i;
   4163 
   4164   NYI_assert (29, 23, 0x5D);
   4165   NYI_assert (21, 21, 1);
   4166   NYI_assert (15, 10, 0x07);
   4167 
   4168   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4169   for (i = 0; i < (full ? 4 : 2); i++)
   4170     {
   4171       uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
   4172       uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
   4173       uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
   4174       if (test_false)
   4175 	aarch64_set_vec_u32 (cpu, vd, i,
   4176 			     (vd_val & vm_val) | (vn_val & ~vm_val));
   4177       else
   4178 	aarch64_set_vec_u32 (cpu, vd, i,
   4179 			     (vd_val & ~vm_val) | (vn_val & vm_val));
   4180     }
   4181 }
   4182 
   4183 static void
   4184 do_vec_ORN (sim_cpu *cpu)
   4185 {
   4186   /* instr[31]    = 0
   4187      instr[30]    = half (0)/full (1)
   4188      instr[29,21] = 00 1110 111
   4189      instr[20,16] = Vm
   4190      instr[15,10] = 00 0111
   4191      instr[9,5]   = Vn
   4192      instr[4.0]   = Vd.  */
   4193 
   4194   unsigned vm = INSTR (20, 16);
   4195   unsigned vn = INSTR (9, 5);
   4196   unsigned vd = INSTR (4, 0);
   4197   unsigned i;
   4198   int      full = INSTR (30, 30);
   4199 
   4200   NYI_assert (29, 21, 0x077);
   4201   NYI_assert (15, 10, 0x07);
   4202 
   4203   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4204   for (i = 0; i < (full ? 16 : 8); i++)
   4205     aarch64_set_vec_u8 (cpu, vd, i,
   4206 			aarch64_get_vec_u8 (cpu, vn, i)
   4207 			| ~ aarch64_get_vec_u8 (cpu, vm, i));
   4208 }
   4209 
   4210 static void
   4211 do_vec_ORR (sim_cpu *cpu)
   4212 {
   4213   /* instr[31]    = 0
   4214      instr[30]    = half (0)/full (1)
   4215      instr[29,21] = 00 1110 101
   4216      instr[20,16] = Vm
   4217      instr[15,10] = 0001 11
   4218      instr[9,5]   = Vn
   4219      instr[4.0]   = Vd.  */
   4220 
   4221   unsigned vm = INSTR (20, 16);
   4222   unsigned vn = INSTR (9, 5);
   4223   unsigned vd = INSTR (4, 0);
   4224   unsigned i;
   4225   int      full = INSTR (30, 30);
   4226 
   4227   NYI_assert (29, 21, 0x075);
   4228   NYI_assert (15, 10, 0x07);
   4229 
   4230   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4231   for (i = 0; i < (full ? 16 : 8); i++)
   4232     aarch64_set_vec_u8 (cpu, vd, i,
   4233 			aarch64_get_vec_u8 (cpu, vn, i)
   4234 			| aarch64_get_vec_u8 (cpu, vm, i));
   4235 }
   4236 
   4237 static void
   4238 do_vec_BIC (sim_cpu *cpu)
   4239 {
   4240   /* instr[31]    = 0
   4241      instr[30]    = half (0)/full (1)
   4242      instr[29,21] = 00 1110 011
   4243      instr[20,16] = Vm
   4244      instr[15,10] = 00 0111
   4245      instr[9,5]   = Vn
   4246      instr[4.0]   = Vd.  */
   4247 
   4248   unsigned vm = INSTR (20, 16);
   4249   unsigned vn = INSTR (9, 5);
   4250   unsigned vd = INSTR (4, 0);
   4251   unsigned i;
   4252   int      full = INSTR (30, 30);
   4253 
   4254   NYI_assert (29, 21, 0x073);
   4255   NYI_assert (15, 10, 0x07);
   4256 
   4257   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4258   for (i = 0; i < (full ? 16 : 8); i++)
   4259     aarch64_set_vec_u8 (cpu, vd, i,
   4260 			aarch64_get_vec_u8 (cpu, vn, i)
   4261 			& ~ aarch64_get_vec_u8 (cpu, vm, i));
   4262 }
   4263 
   4264 static void
   4265 do_vec_XTN (sim_cpu *cpu)
   4266 {
   4267   /* instr[31]    = 0
   4268      instr[30]    = first part (0)/ second part (1)
   4269      instr[29,24] = 00 1110
   4270      instr[23,22] = size: byte(00), half(01), word (10)
   4271      instr[21,10] = 1000 0100 1010
   4272      instr[9,5]   = Vs
   4273      instr[4,0]   = Vd.  */
   4274 
   4275   unsigned vs = INSTR (9, 5);
   4276   unsigned vd = INSTR (4, 0);
   4277   unsigned bias = INSTR (30, 30);
   4278   unsigned i;
   4279 
   4280   NYI_assert (29, 24, 0x0E);
   4281   NYI_assert (21, 10, 0x84A);
   4282 
   4283   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4284   switch (INSTR (23, 22))
   4285     {
   4286     case 0:
   4287       for (i = 0; i < 8; i++)
   4288 	aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
   4289 			    aarch64_get_vec_u16 (cpu, vs, i));
   4290       return;
   4291 
   4292     case 1:
   4293       for (i = 0; i < 4; i++)
   4294 	aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
   4295 			     aarch64_get_vec_u32 (cpu, vs, i));
   4296       return;
   4297 
   4298     case 2:
   4299       for (i = 0; i < 2; i++)
   4300 	aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
   4301 			     aarch64_get_vec_u64 (cpu, vs, i));
   4302       return;
   4303     }
   4304 }
   4305 
/* Count the bits that are set in the argument.  GCC 3.4 and later
   provide a builtin for this; other compilers get a small fallback
   that handles a single byte, which is all that is needed here.  */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
# define popcount __builtin_popcount
#else
static int
popcount (unsigned char x)
{
  int set_bits = 0;

  /* Each iteration clears the lowest bit that is still set, so the
     loop runs once per set bit.  */
  while (x != 0)
    {
      x &= (unsigned char) (x - 1);
      set_bits++;
    }

  return set_bits;
}
#endif
   4325 
   4326 static void
   4327 do_vec_CNT (sim_cpu *cpu)
   4328 {
   4329   /* instr[31]    = 0
   4330      instr[30]    = half (0)/ full (1)
   4331      instr[29,24] = 00 1110
   4332      instr[23,22] = size: byte(00)
   4333      instr[21,10] = 1000 0001 0110
   4334      instr[9,5]   = Vs
   4335      instr[4,0]   = Vd.  */
   4336 
   4337   unsigned vs = INSTR (9, 5);
   4338   unsigned vd = INSTR (4, 0);
   4339   int full = INSTR (30, 30);
   4340   int size = INSTR (23, 22);
   4341   int i;
   4342 
   4343   NYI_assert (29, 24, 0x0E);
   4344   NYI_assert (21, 10, 0x816);
   4345 
   4346   if (size != 0)
   4347     HALT_UNALLOC;
   4348 
   4349   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4350 
   4351   for (i = 0; i < (full ? 16 : 8); i++)
   4352     aarch64_set_vec_u8 (cpu, vd, i,
   4353 			popcount (aarch64_get_vec_u8 (cpu, vs, i)));
   4354 }
   4355 
   4356 static void
   4357 do_vec_maxv (sim_cpu *cpu)
   4358 {
   4359   /* instr[31]    = 0
   4360      instr[30]    = half(0)/full(1)
   4361      instr[29]    = signed (0)/unsigned(1)
   4362      instr[28,24] = 0 1110
   4363      instr[23,22] = size: byte(00), half(01), word (10)
   4364      instr[21]    = 1
   4365      instr[20,17] = 1 000
   4366      instr[16]    = max(0)/min(1)
   4367      instr[15,10] = 1010 10
   4368      instr[9,5]   = V source
   4369      instr[4.0]   = R dest.  */
   4370 
   4371   unsigned vs = INSTR (9, 5);
   4372   unsigned rd = INSTR (4, 0);
   4373   unsigned full = INSTR (30, 30);
   4374   unsigned i;
   4375 
   4376   NYI_assert (28, 24, 0x0E);
   4377   NYI_assert (21, 21, 1);
   4378   NYI_assert (20, 17, 8);
   4379   NYI_assert (15, 10, 0x2A);
   4380 
   4381   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4382   switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
   4383     {
   4384     case 0: /* SMAXV.  */
   4385        {
   4386 	int64_t smax;
   4387 	switch (INSTR (23, 22))
   4388 	  {
   4389 	  case 0:
   4390 	    smax = aarch64_get_vec_s8 (cpu, vs, 0);
   4391 	    for (i = 1; i < (full ? 16 : 8); i++)
   4392 	      smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
   4393 	    break;
   4394 	  case 1:
   4395 	    smax = aarch64_get_vec_s16 (cpu, vs, 0);
   4396 	    for (i = 1; i < (full ? 8 : 4); i++)
   4397 	      smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
   4398 	    break;
   4399 	  case 2:
   4400 	    smax = aarch64_get_vec_s32 (cpu, vs, 0);
   4401 	    for (i = 1; i < (full ? 4 : 2); i++)
   4402 	      smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
   4403 	    break;
   4404 	  case 3:
   4405 	    HALT_UNALLOC;
   4406 	  }
   4407 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
   4408 	return;
   4409       }
   4410 
   4411     case 1: /* SMINV.  */
   4412       {
   4413 	int64_t smin;
   4414 	switch (INSTR (23, 22))
   4415 	  {
   4416 	  case 0:
   4417 	    smin = aarch64_get_vec_s8 (cpu, vs, 0);
   4418 	    for (i = 1; i < (full ? 16 : 8); i++)
   4419 	      smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
   4420 	    break;
   4421 	  case 1:
   4422 	    smin = aarch64_get_vec_s16 (cpu, vs, 0);
   4423 	    for (i = 1; i < (full ? 8 : 4); i++)
   4424 	      smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
   4425 	    break;
   4426 	  case 2:
   4427 	    smin = aarch64_get_vec_s32 (cpu, vs, 0);
   4428 	    for (i = 1; i < (full ? 4 : 2); i++)
   4429 	      smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
   4430 	    break;
   4431 
   4432 	  case 3:
   4433 	    HALT_UNALLOC;
   4434 	  }
   4435 	aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
   4436 	return;
   4437       }
   4438 
   4439     case 2: /* UMAXV.  */
   4440       {
   4441 	uint64_t umax;
   4442 	switch (INSTR (23, 22))
   4443 	  {
   4444 	  case 0:
   4445 	    umax = aarch64_get_vec_u8 (cpu, vs, 0);
   4446 	    for (i = 1; i < (full ? 16 : 8); i++)
   4447 	      umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
   4448 	    break;
   4449 	  case 1:
   4450 	    umax = aarch64_get_vec_u16 (cpu, vs, 0);
   4451 	    for (i = 1; i < (full ? 8 : 4); i++)
   4452 	      umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
   4453 	    break;
   4454 	  case 2:
   4455 	    umax = aarch64_get_vec_u32 (cpu, vs, 0);
   4456 	    for (i = 1; i < (full ? 4 : 2); i++)
   4457 	      umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
   4458 	    break;
   4459 
   4460 	  case 3:
   4461 	    HALT_UNALLOC;
   4462 	  }
   4463 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
   4464 	return;
   4465       }
   4466 
   4467     case 3: /* UMINV.  */
   4468       {
   4469 	uint64_t umin;
   4470 	switch (INSTR (23, 22))
   4471 	  {
   4472 	  case 0:
   4473 	    umin = aarch64_get_vec_u8 (cpu, vs, 0);
   4474 	    for (i = 1; i < (full ? 16 : 8); i++)
   4475 	      umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
   4476 	    break;
   4477 	  case 1:
   4478 	    umin = aarch64_get_vec_u16 (cpu, vs, 0);
   4479 	    for (i = 1; i < (full ? 8 : 4); i++)
   4480 	      umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
   4481 	    break;
   4482 	  case 2:
   4483 	    umin = aarch64_get_vec_u32 (cpu, vs, 0);
   4484 	    for (i = 1; i < (full ? 4 : 2); i++)
   4485 	      umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
   4486 	    break;
   4487 
   4488 	  case 3:
   4489 	    HALT_UNALLOC;
   4490 	  }
   4491 	aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
   4492 	return;
   4493       }
   4494     }
   4495 }
   4496 
   4497 static void
   4498 do_vec_fminmaxV (sim_cpu *cpu)
   4499 {
   4500   /* instr[31,24] = 0110 1110
   4501      instr[23]    = max(0)/min(1)
   4502      instr[22,14] = 011 0000 11
   4503      instr[13,12] = nm(00)/normal(11)
   4504      instr[11,10] = 10
   4505      instr[9,5]   = V source
   4506      instr[4.0]   = R dest.  */
   4507 
   4508   unsigned vs = INSTR (9, 5);
   4509   unsigned rd = INSTR (4, 0);
   4510   unsigned i;
   4511   float res   = aarch64_get_vec_float (cpu, vs, 0);
   4512 
   4513   NYI_assert (31, 24, 0x6E);
   4514   NYI_assert (22, 14, 0x0C3);
   4515   NYI_assert (11, 10, 2);
   4516 
   4517   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4518   if (INSTR (23, 23))
   4519     {
   4520       switch (INSTR (13, 12))
   4521 	{
   4522 	case 0: /* FMNINNMV.  */
   4523 	  for (i = 1; i < 4; i++)
   4524 	    res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
   4525 	  break;
   4526 
   4527 	case 3: /* FMINV.  */
   4528 	  for (i = 1; i < 4; i++)
   4529 	    res = min (res, aarch64_get_vec_float (cpu, vs, i));
   4530 	  break;
   4531 
   4532 	default:
   4533 	  HALT_NYI;
   4534 	}
   4535     }
   4536   else
   4537     {
   4538       switch (INSTR (13, 12))
   4539 	{
   4540 	case 0: /* FMNAXNMV.  */
   4541 	  for (i = 1; i < 4; i++)
   4542 	    res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
   4543 	  break;
   4544 
   4545 	case 3: /* FMAXV.  */
   4546 	  for (i = 1; i < 4; i++)
   4547 	    res = max (res, aarch64_get_vec_float (cpu, vs, i));
   4548 	  break;
   4549 
   4550 	default:
   4551 	  HALT_NYI;
   4552 	}
   4553     }
   4554 
   4555   aarch64_set_FP_float (cpu, rd, res);
   4556 }
   4557 
   4558 static void
   4559 do_vec_Fminmax (sim_cpu *cpu)
   4560 {
   4561   /* instr[31]    = 0
   4562      instr[30]    = half(0)/full(1)
   4563      instr[29,24] = 00 1110
   4564      instr[23]    = max(0)/min(1)
   4565      instr[22]    = float(0)/double(1)
   4566      instr[21]    = 1
   4567      instr[20,16] = Vm
   4568      instr[15,14] = 11
   4569      instr[13,12] = nm(00)/normal(11)
   4570      instr[11,10] = 01
   4571      instr[9,5]   = Vn
   4572      instr[4,0]   = Vd.  */
   4573 
   4574   unsigned vm = INSTR (20, 16);
   4575   unsigned vn = INSTR (9, 5);
   4576   unsigned vd = INSTR (4, 0);
   4577   unsigned full = INSTR (30, 30);
   4578   unsigned min = INSTR (23, 23);
   4579   unsigned i;
   4580 
   4581   NYI_assert (29, 24, 0x0E);
   4582   NYI_assert (21, 21, 1);
   4583   NYI_assert (15, 14, 3);
   4584   NYI_assert (11, 10, 1);
   4585 
   4586   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4587   if (INSTR (22, 22))
   4588     {
   4589       double (* func)(double, double);
   4590 
   4591       if (! full)
   4592 	HALT_NYI;
   4593 
   4594       if (INSTR (13, 12) == 0)
   4595 	func = min ? dminnm : dmaxnm;
   4596       else if (INSTR (13, 12) == 3)
   4597 	func = min ? fmin : fmax;
   4598       else
   4599 	HALT_NYI;
   4600 
   4601       for (i = 0; i < 2; i++)
   4602 	aarch64_set_vec_double (cpu, vd, i,
   4603 				func (aarch64_get_vec_double (cpu, vn, i),
   4604 				      aarch64_get_vec_double (cpu, vm, i)));
   4605     }
   4606   else
   4607     {
   4608       float (* func)(float, float);
   4609 
   4610       if (INSTR (13, 12) == 0)
   4611 	func = min ? fminnm : fmaxnm;
   4612       else if (INSTR (13, 12) == 3)
   4613 	func = min ? fminf : fmaxf;
   4614       else
   4615 	HALT_NYI;
   4616 
   4617       for (i = 0; i < (full ? 4 : 2); i++)
   4618 	aarch64_set_vec_float (cpu, vd, i,
   4619 			       func (aarch64_get_vec_float (cpu, vn, i),
   4620 				     aarch64_get_vec_float (cpu, vm, i)));
   4621     }
   4622 }
   4623 
   4624 static void
   4625 do_vec_SCVTF (sim_cpu *cpu)
   4626 {
   4627   /* instr[31]    = 0
   4628      instr[30]    = Q
   4629      instr[29,23] = 00 1110 0
   4630      instr[22]    = float(0)/double(1)
   4631      instr[21,10] = 10 0001 1101 10
   4632      instr[9,5]   = Vn
   4633      instr[4,0]   = Vd.  */
   4634 
   4635   unsigned vn = INSTR (9, 5);
   4636   unsigned vd = INSTR (4, 0);
   4637   unsigned full = INSTR (30, 30);
   4638   unsigned size = INSTR (22, 22);
   4639   unsigned i;
   4640 
   4641   NYI_assert (29, 23, 0x1C);
   4642   NYI_assert (21, 10, 0x876);
   4643 
   4644   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4645   if (size)
   4646     {
   4647       if (! full)
   4648 	HALT_UNALLOC;
   4649 
   4650       for (i = 0; i < 2; i++)
   4651 	{
   4652 	  double val = (double) aarch64_get_vec_u64 (cpu, vn, i);
   4653 	  aarch64_set_vec_double (cpu, vd, i, val);
   4654 	}
   4655     }
   4656   else
   4657     {
   4658       for (i = 0; i < (full ? 4 : 2); i++)
   4659 	{
   4660 	  float val = (float) aarch64_get_vec_u32 (cpu, vn, i);
   4661 	  aarch64_set_vec_float (cpu, vd, i, val);
   4662 	}
   4663     }
   4664 }
   4665 
/* Lane-wise integer comparison of Vn with Vm.  SOURCE picks the element
   accessors ('s' or 'u') and CMP is the binary operator applied to each
   pair of lanes.  A lane of Vd receives -1 (-1ULL for 64-bit lanes) when
   the expression is non-zero and 0 otherwise.  The macro expects cpu,
   size, full, vd, vn, vm and i to be visible in the enclosing scope and
   returns from the enclosing function once the lanes for the selected
   size have been written.  */
#define VEC_CMP(SOURCE, CMP)                                            \
  do                                                                    \
    {                                                                   \
      if (size == 0)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 16 : 8); i++)                         \
            aarch64_set_vec_u8 (cpu, vd, i,                             \
                                (aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
                                 CMP                                    \
                                 aarch64_get_vec_##SOURCE##8 (cpu, vm, i)) \
                                ? -1 : 0);                              \
          return;                                                       \
        }                                                               \
      if (size == 1)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 8 : 4); i++)                          \
            aarch64_set_vec_u16 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
                                  CMP                                   \
                                  aarch64_get_vec_##SOURCE##16 (cpu, vm, i)) \
                                 ? -1 : 0);                             \
          return;                                                       \
        }                                                               \
      if (size == 2)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
                                  CMP                                   \
                                  aarch64_get_vec_##SOURCE##32 (cpu, vm, i)) \
                                 ? -1 : 0);                             \
          return;                                                       \
        }                                                               \
      if (size == 3)                                                    \
        {                                                               \
          /* 64-bit lanes need the full width register.  */             \
          if (! full)                                                   \
            HALT_UNALLOC;                                               \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
                                  CMP                                   \
                                  aarch64_get_vec_##SOURCE##64 (cpu, vm, i)) \
                                 ? -1ULL : 0);                          \
          return;                                                       \
        }                                                               \
    }                                                                   \
  while (0)
   4708 
/* Lane-wise integer comparison of Vn with the constant 0.  SOURCE picks
   the element accessors ('s' or 'u') and CMP is the binary operator
   placed between each lane and 0.  A lane of Vd receives -1 (-1ULL for
   64-bit lanes) when the expression is non-zero and 0 otherwise.  The
   macro expects cpu, size, full, vd, vn and i to be visible in the
   enclosing scope and returns from the enclosing function once the
   lanes for the selected size have been written.  */
#define VEC_CMP0(SOURCE, CMP)                                           \
  do                                                                    \
    {                                                                   \
      if (size == 0)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 16 : 8); i++)                         \
            aarch64_set_vec_u8 (cpu, vd, i,                             \
                                (aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
                                 CMP 0) ? -1 : 0);                      \
          return;                                                       \
        }                                                               \
      if (size == 1)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 8 : 4); i++)                          \
            aarch64_set_vec_u16 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
                                  CMP 0) ? -1 : 0);                     \
          return;                                                       \
        }                                                               \
      if (size == 2)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
                                  CMP 0) ? -1 : 0);                     \
          return;                                                       \
        }                                                               \
      if (size == 3)                                                    \
        {                                                               \
          /* 64-bit lanes need the full width register.  */             \
          if (! full)                                                   \
            HALT_UNALLOC;                                               \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
                                  CMP 0) ? -1ULL : 0);                  \
          return;                                                       \
        }                                                               \
    }                                                                   \
  while (0)
   4743 
/* Lane-wise floating point comparison of Vn with 0.0.  CMP is the binary
   operator placed between each lane and 0.0.  Instruction bit 22 selects
   double lanes (results stored as 64-bit integers) or float lanes
   (results stored as 32-bit integers); a result lane is -1 when the
   comparison holds and 0 otherwise.  A non-zero vm, or double lanes on a
   half width register, stop with HALT_NYI.  The macro expects cpu, full,
   vd, vn, vm and i to be visible in the enclosing scope and always
   returns from the enclosing function.  */
#define VEC_FCMP0(CMP)                                                  \
  do                                                                    \
    {                                                                   \
      if (vm != 0)                                                      \
        HALT_NYI;                                                       \
      if (! INSTR (22, 22))                                             \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_float (cpu, vn, i)    \
                                  CMP 0.0) ? -1 : 0);                   \
        }                                                               \
      else                                                              \
        {                                                               \
          if (! full)                                                   \
            HALT_NYI;                                                   \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_double (cpu, vn, i)   \
                                  CMP 0.0) ? -1 : 0);                   \
        }                                                               \
      return;                                                           \
    }                                                                   \
  while (0)
   4768 
/* Lane-wise floating point comparison of Vn with Vm.  CMP is the binary
   operator applied to each pair of lanes.  Instruction bit 22 selects
   double lanes (results stored as 64-bit integers) or float lanes
   (results stored as 32-bit integers); a result lane is -1 when the
   comparison holds and 0 otherwise.  Double lanes on a half width
   register stop with HALT_NYI.  The macro expects cpu, full, vd, vn, vm
   and i to be visible in the enclosing scope and always returns from the
   enclosing function.  */
#define VEC_FCMP(CMP)                                                   \
  do                                                                    \
    {                                                                   \
      if (! INSTR (22, 22))                                             \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_float (cpu, vn, i)    \
                                  CMP                                   \
                                  aarch64_get_vec_float (cpu, vm, i))   \
                                 ? -1 : 0);                             \
        }                                                               \
      else                                                              \
        {                                                               \
          if (! full)                                                   \
            HALT_NYI;                                                   \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_double (cpu, vn, i)   \
                                  CMP                                   \
                                  aarch64_get_vec_double (cpu, vm, i))  \
                                 ? -1 : 0);                             \
        }                                                               \
      return;                                                           \
    }                                                                   \
  while (0)
   4795 
   4796 static void
   4797 do_vec_compare (sim_cpu *cpu)
   4798 {
   4799   /* instr[31]    = 0
   4800      instr[30]    = half(0)/full(1)
   4801      instr[29]    = part-of-comparison-type
   4802      instr[28,24] = 0 1110
   4803      instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
   4804                     type of float compares: single (-0) / double (-1)
   4805      instr[21]    = 1
   4806      instr[20,16] = Vm or 00000 (compare vs 0)
   4807      instr[15,10] = part-of-comparison-type
   4808      instr[9,5]   = Vn
   4809      instr[4.0]   = Vd.  */
   4810 
   4811   int full = INSTR (30, 30);
   4812   int size = INSTR (23, 22);
   4813   unsigned vm = INSTR (20, 16);
   4814   unsigned vn = INSTR (9, 5);
   4815   unsigned vd = INSTR (4, 0);
   4816   unsigned i;
   4817 
   4818   NYI_assert (28, 24, 0x0E);
   4819   NYI_assert (21, 21, 1);
   4820 
   4821   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4822   if ((INSTR (11, 11)
   4823        && INSTR (14, 14))
   4824       || ((INSTR (11, 11) == 0
   4825 	   && INSTR (10, 10) == 0)))
   4826     {
   4827       /* A compare vs 0.  */
   4828       if (vm != 0)
   4829 	{
   4830 	  if (INSTR (15, 10) == 0x2A)
   4831 	    do_vec_maxv (cpu);
   4832 	  else if (INSTR (15, 10) == 0x32
   4833 		   || INSTR (15, 10) == 0x3E)
   4834 	    do_vec_fminmaxV (cpu);
   4835 	  else if (INSTR (29, 23) == 0x1C
   4836 		   && INSTR (21, 10) == 0x876)
   4837 	    do_vec_SCVTF (cpu);
   4838 	  else
   4839 	    HALT_NYI;
   4840 	  return;
   4841 	}
   4842     }
   4843 
   4844   if (INSTR (14, 14))
   4845     {
   4846       /* A floating point compare.  */
   4847       unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
   4848 	| INSTR (13, 10);
   4849 
   4850       NYI_assert (15, 15, 1);
   4851 
   4852       switch (decode)
   4853 	{
   4854 	case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
   4855 	case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
   4856 	case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
   4857 	case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
   4858 	case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
   4859 	case /* 0b111001: GT */   0x39: VEC_FCMP  (>);
   4860 	case /* 0b101001: GE */   0x29: VEC_FCMP  (>=);
   4861 	case /* 0b001001: EQ */   0x09: VEC_FCMP  (==);
   4862 
   4863 	default:
   4864 	  HALT_NYI;
   4865 	}
   4866     }
   4867   else
   4868     {
   4869       unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
   4870 
   4871       switch (decode)
   4872 	{
   4873 	case 0x0D: /* 0001101 GT */     VEC_CMP  (s, > );
   4874 	case 0x0F: /* 0001111 GE */     VEC_CMP  (s, >= );
   4875 	case 0x22: /* 0100010 GT #0 */  VEC_CMP0 (s, > );
   4876 	case 0x23: /* 0100011 TST */	VEC_CMP  (u, & );
   4877 	case 0x26: /* 0100110 EQ #0 */  VEC_CMP0 (s, == );
   4878 	case 0x2A: /* 0101010 LT #0 */  VEC_CMP0 (s, < );
   4879 	case 0x4D: /* 1001101 HI */     VEC_CMP  (u, > );
   4880 	case 0x4F: /* 1001111 HS */     VEC_CMP  (u, >= );
   4881 	case 0x62: /* 1100010 GE #0 */  VEC_CMP0 (s, >= );
   4882 	case 0x63: /* 1100011 EQ */     VEC_CMP  (u, == );
   4883 	case 0x66: /* 1100110 LE #0 */  VEC_CMP0 (s, <= );
   4884 	default:
   4885 	  if (vm == 0)
   4886 	    HALT_NYI;
   4887 	  do_vec_maxv (cpu);
   4888 	}
   4889     }
   4890 }
   4891 
   4892 static void
   4893 do_vec_SSHL (sim_cpu *cpu)
   4894 {
   4895   /* instr[31]    = 0
   4896      instr[30]    = first part (0)/ second part (1)
   4897      instr[29,24] = 00 1110
   4898      instr[23,22] = size: byte(00), half(01), word (10), long (11)
   4899      instr[21]    = 1
   4900      instr[20,16] = Vm
   4901      instr[15,10] = 0100 01
   4902      instr[9,5]   = Vn
   4903      instr[4,0]   = Vd.  */
   4904 
   4905   unsigned full = INSTR (30, 30);
   4906   unsigned vm = INSTR (20, 16);
   4907   unsigned vn = INSTR (9, 5);
   4908   unsigned vd = INSTR (4, 0);
   4909   unsigned i;
   4910   signed int shift;
   4911 
   4912   NYI_assert (29, 24, 0x0E);
   4913   NYI_assert (21, 21, 1);
   4914   NYI_assert (15, 10, 0x11);
   4915 
   4916   /* FIXME: What is a signed shift left in this context ?.  */
   4917 
   4918   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   4919   switch (INSTR (23, 22))
   4920     {
   4921     case 0:
   4922       for (i = 0; i < (full ? 16 : 8); i++)
   4923 	{
   4924 	  shift = aarch64_get_vec_s8 (cpu, vm, i);
   4925 	  if (shift >= 0)
   4926 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
   4927 				<< shift);
   4928 	  else
   4929 	    aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
   4930 				>> - shift);
   4931 	}
   4932       return;
   4933 
   4934     case 1:
   4935       for (i = 0; i < (full ? 8 : 4); i++)
   4936 	{
   4937 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
   4938 	  if (shift >= 0)
   4939 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
   4940 				 << shift);
   4941 	  else
   4942 	    aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
   4943 				 >> - shift);
   4944 	}
   4945       return;
   4946 
   4947     case 2:
   4948       for (i = 0; i < (full ? 4 : 2); i++)
   4949 	{
   4950 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
   4951 	  if (shift >= 0)
   4952 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
   4953 				 << shift);
   4954 	  else
   4955 	    aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
   4956 				 >> - shift);
   4957 	}
   4958       return;
   4959 
   4960     case 3:
   4961       if (! full)
   4962 	HALT_UNALLOC;
   4963       for (i = 0; i < 2; i++)
   4964 	{
   4965 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
   4966 	  if (shift >= 0)
   4967 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
   4968 				 << shift);
   4969 	  else
   4970 	    aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
   4971 				 >> - shift);
   4972 	}
   4973       return;
   4974     }
   4975 }
   4976 
   4977 static void
   4978 do_vec_USHL (sim_cpu *cpu)
   4979 {
   4980   /* instr[31]    = 0
   4981      instr[30]    = first part (0)/ second part (1)
   4982      instr[29,24] = 10 1110
   4983      instr[23,22] = size: byte(00), half(01), word (10), long (11)
   4984      instr[21]    = 1
   4985      instr[20,16] = Vm
   4986      instr[15,10] = 0100 01
   4987      instr[9,5]   = Vn
   4988      instr[4,0]   = Vd  */
   4989 
   4990   unsigned full = INSTR (30, 30);
   4991   unsigned vm = INSTR (20, 16);
   4992   unsigned vn = INSTR (9, 5);
   4993   unsigned vd = INSTR (4, 0);
   4994   unsigned i;
   4995   signed int shift;
   4996 
   4997   NYI_assert (29, 24, 0x2E);
   4998   NYI_assert (15, 10, 0x11);
   4999 
   5000   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5001   switch (INSTR (23, 22))
   5002     {
   5003     case 0:
   5004 	for (i = 0; i < (full ? 16 : 8); i++)
   5005 	  {
   5006 	    shift = aarch64_get_vec_s8 (cpu, vm, i);
   5007 	    if (shift >= 0)
   5008 	      aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
   5009 				  << shift);
   5010 	    else
   5011 	      aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
   5012 				  >> - shift);
   5013 	  }
   5014       return;
   5015 
   5016     case 1:
   5017       for (i = 0; i < (full ? 8 : 4); i++)
   5018 	{
   5019 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
   5020 	  if (shift >= 0)
   5021 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
   5022 				 << shift);
   5023 	  else
   5024 	    aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
   5025 				 >> - shift);
   5026 	}
   5027       return;
   5028 
   5029     case 2:
   5030       for (i = 0; i < (full ? 4 : 2); i++)
   5031 	{
   5032 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
   5033 	  if (shift >= 0)
   5034 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
   5035 				 << shift);
   5036 	  else
   5037 	    aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
   5038 				 >> - shift);
   5039 	}
   5040       return;
   5041 
   5042     case 3:
   5043       if (! full)
   5044 	HALT_UNALLOC;
   5045       for (i = 0; i < 2; i++)
   5046 	{
   5047 	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
   5048 	  if (shift >= 0)
   5049 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
   5050 				 << shift);
   5051 	  else
   5052 	    aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
   5053 				 >> - shift);
   5054 	}
   5055       return;
   5056     }
   5057 }
   5058 
   5059 static void
   5060 do_vec_FMLA (sim_cpu *cpu)
   5061 {
   5062   /* instr[31]    = 0
   5063      instr[30]    = full/half selector
   5064      instr[29,23] = 0011100
   5065      instr[22]    = size: 0=>float, 1=>double
   5066      instr[21]    = 1
   5067      instr[20,16] = Vn
   5068      instr[15,10] = 1100 11
   5069      instr[9,5]   = Vm
   5070      instr[4.0]   = Vd.  */
   5071 
   5072   unsigned vm = INSTR (20, 16);
   5073   unsigned vn = INSTR (9, 5);
   5074   unsigned vd = INSTR (4, 0);
   5075   unsigned i;
   5076   int      full = INSTR (30, 30);
   5077 
   5078   NYI_assert (29, 23, 0x1C);
   5079   NYI_assert (21, 21, 1);
   5080   NYI_assert (15, 10, 0x33);
   5081 
   5082   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5083   if (INSTR (22, 22))
   5084     {
   5085       if (! full)
   5086 	HALT_UNALLOC;
   5087       for (i = 0; i < 2; i++)
   5088 	aarch64_set_vec_double (cpu, vd, i,
   5089 				aarch64_get_vec_double (cpu, vn, i) *
   5090 				aarch64_get_vec_double (cpu, vm, i) +
   5091 				aarch64_get_vec_double (cpu, vd, i));
   5092     }
   5093   else
   5094     {
   5095       for (i = 0; i < (full ? 4 : 2); i++)
   5096 	aarch64_set_vec_float (cpu, vd, i,
   5097 			       aarch64_get_vec_float (cpu, vn, i) *
   5098 			       aarch64_get_vec_float (cpu, vm, i) +
   5099 			       aarch64_get_vec_float (cpu, vd, i));
   5100     }
   5101 }
   5102 
   5103 static void
   5104 do_vec_max (sim_cpu *cpu)
   5105 {
   5106   /* instr[31]    = 0
   5107      instr[30]    = full/half selector
   5108      instr[29]    = SMAX (0) / UMAX (1)
   5109      instr[28,24] = 0 1110
   5110      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
   5111      instr[21]    = 1
   5112      instr[20,16] = Vn
   5113      instr[15,10] = 0110 01
   5114      instr[9,5]   = Vm
   5115      instr[4.0]   = Vd.  */
   5116 
   5117   unsigned vm = INSTR (20, 16);
   5118   unsigned vn = INSTR (9, 5);
   5119   unsigned vd = INSTR (4, 0);
   5120   unsigned i;
   5121   int      full = INSTR (30, 30);
   5122 
   5123   NYI_assert (28, 24, 0x0E);
   5124   NYI_assert (21, 21, 1);
   5125   NYI_assert (15, 10, 0x19);
   5126 
   5127   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5128   if (INSTR (29, 29))
   5129     {
   5130       switch (INSTR (23, 22))
   5131 	{
   5132 	case 0:
   5133 	  for (i = 0; i < (full ? 16 : 8); i++)
   5134 	    aarch64_set_vec_u8 (cpu, vd, i,
   5135 				aarch64_get_vec_u8 (cpu, vn, i)
   5136 				> aarch64_get_vec_u8 (cpu, vm, i)
   5137 				? aarch64_get_vec_u8 (cpu, vn, i)
   5138 				: aarch64_get_vec_u8 (cpu, vm, i));
   5139 	  return;
   5140 
   5141 	case 1:
   5142 	  for (i = 0; i < (full ? 8 : 4); i++)
   5143 	    aarch64_set_vec_u16 (cpu, vd, i,
   5144 				 aarch64_get_vec_u16 (cpu, vn, i)
   5145 				 > aarch64_get_vec_u16 (cpu, vm, i)
   5146 				 ? aarch64_get_vec_u16 (cpu, vn, i)
   5147 				 : aarch64_get_vec_u16 (cpu, vm, i));
   5148 	  return;
   5149 
   5150 	case 2:
   5151 	  for (i = 0; i < (full ? 4 : 2); i++)
   5152 	    aarch64_set_vec_u32 (cpu, vd, i,
   5153 				 aarch64_get_vec_u32 (cpu, vn, i)
   5154 				 > aarch64_get_vec_u32 (cpu, vm, i)
   5155 				 ? aarch64_get_vec_u32 (cpu, vn, i)
   5156 				 : aarch64_get_vec_u32 (cpu, vm, i));
   5157 	  return;
   5158 
   5159 	case 3:
   5160 	  HALT_UNALLOC;
   5161 	}
   5162     }
   5163   else
   5164     {
   5165       switch (INSTR (23, 22))
   5166 	{
   5167 	case 0:
   5168 	  for (i = 0; i < (full ? 16 : 8); i++)
   5169 	    aarch64_set_vec_s8 (cpu, vd, i,
   5170 				aarch64_get_vec_s8 (cpu, vn, i)
   5171 				> aarch64_get_vec_s8 (cpu, vm, i)
   5172 				? aarch64_get_vec_s8 (cpu, vn, i)
   5173 				: aarch64_get_vec_s8 (cpu, vm, i));
   5174 	  return;
   5175 
   5176 	case 1:
   5177 	  for (i = 0; i < (full ? 8 : 4); i++)
   5178 	    aarch64_set_vec_s16 (cpu, vd, i,
   5179 				 aarch64_get_vec_s16 (cpu, vn, i)
   5180 				 > aarch64_get_vec_s16 (cpu, vm, i)
   5181 				 ? aarch64_get_vec_s16 (cpu, vn, i)
   5182 				 : aarch64_get_vec_s16 (cpu, vm, i));
   5183 	  return;
   5184 
   5185 	case 2:
   5186 	  for (i = 0; i < (full ? 4 : 2); i++)
   5187 	    aarch64_set_vec_s32 (cpu, vd, i,
   5188 				 aarch64_get_vec_s32 (cpu, vn, i)
   5189 				 > aarch64_get_vec_s32 (cpu, vm, i)
   5190 				 ? aarch64_get_vec_s32 (cpu, vn, i)
   5191 				 : aarch64_get_vec_s32 (cpu, vm, i));
   5192 	  return;
   5193 
   5194 	case 3:
   5195 	  HALT_UNALLOC;
   5196 	}
   5197     }
   5198 }
   5199 
   5200 static void
   5201 do_vec_min (sim_cpu *cpu)
   5202 {
   5203   /* instr[31]    = 0
   5204      instr[30]    = full/half selector
   5205      instr[29]    = SMIN (0) / UMIN (1)
   5206      instr[28,24] = 0 1110
   5207      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
   5208      instr[21]    = 1
   5209      instr[20,16] = Vn
   5210      instr[15,10] = 0110 11
   5211      instr[9,5]   = Vm
   5212      instr[4.0]   = Vd.  */
   5213 
   5214   unsigned vm = INSTR (20, 16);
   5215   unsigned vn = INSTR (9, 5);
   5216   unsigned vd = INSTR (4, 0);
   5217   unsigned i;
   5218   int      full = INSTR (30, 30);
   5219 
   5220   NYI_assert (28, 24, 0x0E);
   5221   NYI_assert (21, 21, 1);
   5222   NYI_assert (15, 10, 0x1B);
   5223 
   5224   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5225   if (INSTR (29, 29))
   5226     {
   5227       switch (INSTR (23, 22))
   5228 	{
   5229 	case 0:
   5230 	  for (i = 0; i < (full ? 16 : 8); i++)
   5231 	    aarch64_set_vec_u8 (cpu, vd, i,
   5232 				aarch64_get_vec_u8 (cpu, vn, i)
   5233 				< aarch64_get_vec_u8 (cpu, vm, i)
   5234 				? aarch64_get_vec_u8 (cpu, vn, i)
   5235 				: aarch64_get_vec_u8 (cpu, vm, i));
   5236 	  return;
   5237 
   5238 	case 1:
   5239 	  for (i = 0; i < (full ? 8 : 4); i++)
   5240 	    aarch64_set_vec_u16 (cpu, vd, i,
   5241 				 aarch64_get_vec_u16 (cpu, vn, i)
   5242 				 < aarch64_get_vec_u16 (cpu, vm, i)
   5243 				 ? aarch64_get_vec_u16 (cpu, vn, i)
   5244 				 : aarch64_get_vec_u16 (cpu, vm, i));
   5245 	  return;
   5246 
   5247 	case 2:
   5248 	  for (i = 0; i < (full ? 4 : 2); i++)
   5249 	    aarch64_set_vec_u32 (cpu, vd, i,
   5250 				 aarch64_get_vec_u32 (cpu, vn, i)
   5251 				 < aarch64_get_vec_u32 (cpu, vm, i)
   5252 				 ? aarch64_get_vec_u32 (cpu, vn, i)
   5253 				 : aarch64_get_vec_u32 (cpu, vm, i));
   5254 	  return;
   5255 
   5256 	case 3:
   5257 	  HALT_UNALLOC;
   5258 	}
   5259     }
   5260   else
   5261     {
   5262       switch (INSTR (23, 22))
   5263 	{
   5264 	case 0:
   5265 	  for (i = 0; i < (full ? 16 : 8); i++)
   5266 	    aarch64_set_vec_s8 (cpu, vd, i,
   5267 				aarch64_get_vec_s8 (cpu, vn, i)
   5268 				< aarch64_get_vec_s8 (cpu, vm, i)
   5269 				? aarch64_get_vec_s8 (cpu, vn, i)
   5270 				: aarch64_get_vec_s8 (cpu, vm, i));
   5271 	  return;
   5272 
   5273 	case 1:
   5274 	  for (i = 0; i < (full ? 8 : 4); i++)
   5275 	    aarch64_set_vec_s16 (cpu, vd, i,
   5276 				 aarch64_get_vec_s16 (cpu, vn, i)
   5277 				 < aarch64_get_vec_s16 (cpu, vm, i)
   5278 				 ? aarch64_get_vec_s16 (cpu, vn, i)
   5279 				 : aarch64_get_vec_s16 (cpu, vm, i));
   5280 	  return;
   5281 
   5282 	case 2:
   5283 	  for (i = 0; i < (full ? 4 : 2); i++)
   5284 	    aarch64_set_vec_s32 (cpu, vd, i,
   5285 				 aarch64_get_vec_s32 (cpu, vn, i)
   5286 				 < aarch64_get_vec_s32 (cpu, vm, i)
   5287 				 ? aarch64_get_vec_s32 (cpu, vn, i)
   5288 				 : aarch64_get_vec_s32 (cpu, vm, i));
   5289 	  return;
   5290 
   5291 	case 3:
   5292 	  HALT_UNALLOC;
   5293 	}
   5294     }
   5295 }
   5296 
   5297 static void
   5298 do_vec_sub_long (sim_cpu *cpu)
   5299 {
   5300   /* instr[31]    = 0
   5301      instr[30]    = lower (0) / upper (1)
   5302      instr[29]    = signed (0) / unsigned (1)
   5303      instr[28,24] = 0 1110
   5304      instr[23,22] = size: bytes (00), half (01), word (10)
   5305      instr[21]    = 1
   5306      insrt[20,16] = Vm
   5307      instr[15,10] = 0010 00
   5308      instr[9,5]   = Vn
   5309      instr[4,0]   = V dest.  */
   5310 
   5311   unsigned size = INSTR (23, 22);
   5312   unsigned vm = INSTR (20, 16);
   5313   unsigned vn = INSTR (9, 5);
   5314   unsigned vd = INSTR (4, 0);
   5315   unsigned bias = 0;
   5316   unsigned i;
   5317 
   5318   NYI_assert (28, 24, 0x0E);
   5319   NYI_assert (21, 21, 1);
   5320   NYI_assert (15, 10, 0x08);
   5321 
   5322   if (size == 3)
   5323     HALT_UNALLOC;
   5324 
   5325   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5326   switch (INSTR (30, 29))
   5327     {
   5328     case 2: /* SSUBL2.  */
   5329       bias = 2;
   5330     case 0: /* SSUBL.  */
   5331       switch (size)
   5332 	{
   5333 	case 0:
   5334 	  bias *= 3;
   5335 	  for (i = 0; i < 8; i++)
   5336 	    aarch64_set_vec_s16 (cpu, vd, i,
   5337 				 aarch64_get_vec_s8 (cpu, vn, i + bias)
   5338 				 - aarch64_get_vec_s8 (cpu, vm, i + bias));
   5339 	  break;
   5340 
   5341 	case 1:
   5342 	  bias *= 2;
   5343 	  for (i = 0; i < 4; i++)
   5344 	    aarch64_set_vec_s32 (cpu, vd, i,
   5345 				 aarch64_get_vec_s16 (cpu, vn, i + bias)
   5346 				 - aarch64_get_vec_s16 (cpu, vm, i + bias));
   5347 	  break;
   5348 
   5349 	case 2:
   5350 	  for (i = 0; i < 2; i++)
   5351 	    aarch64_set_vec_s64 (cpu, vd, i,
   5352 				 aarch64_get_vec_s32 (cpu, vn, i + bias)
   5353 				 - aarch64_get_vec_s32 (cpu, vm, i + bias));
   5354 	  break;
   5355 
   5356 	default:
   5357 	  HALT_UNALLOC;
   5358 	}
   5359       break;
   5360 
   5361     case 3: /* USUBL2.  */
   5362       bias = 2;
   5363     case 1: /* USUBL.  */
   5364       switch (size)
   5365 	{
   5366 	case 0:
   5367 	  bias *= 3;
   5368 	  for (i = 0; i < 8; i++)
   5369 	    aarch64_set_vec_u16 (cpu, vd, i,
   5370 				 aarch64_get_vec_u8 (cpu, vn, i + bias)
   5371 				 - aarch64_get_vec_u8 (cpu, vm, i + bias));
   5372 	  break;
   5373 
   5374 	case 1:
   5375 	  bias *= 2;
   5376 	  for (i = 0; i < 4; i++)
   5377 	    aarch64_set_vec_u32 (cpu, vd, i,
   5378 				 aarch64_get_vec_u16 (cpu, vn, i + bias)
   5379 				 - aarch64_get_vec_u16 (cpu, vm, i + bias));
   5380 	  break;
   5381 
   5382 	case 2:
   5383 	  for (i = 0; i < 2; i++)
   5384 	    aarch64_set_vec_u64 (cpu, vd, i,
   5385 				 aarch64_get_vec_u32 (cpu, vn, i + bias)
   5386 				 - aarch64_get_vec_u32 (cpu, vm, i + bias));
   5387 	  break;
   5388 
   5389 	default:
   5390 	  HALT_UNALLOC;
   5391 	}
   5392       break;
   5393     }
   5394 }
   5395 
   5396 static void
   5397 do_vec_ADDP (sim_cpu *cpu)
   5398 {
   5399   /* instr[31]    = 0
   5400      instr[30]    = half(0)/full(1)
   5401      instr[29,24] = 00 1110
   5402      instr[23,22] = size: bytes (00), half (01), word (10), long (11)
   5403      instr[21]    = 1
   5404      insrt[20,16] = Vm
   5405      instr[15,10] = 1011 11
   5406      instr[9,5]   = Vn
   5407      instr[4,0]   = V dest.  */
   5408 
   5409   FRegister copy_vn;
   5410   FRegister copy_vm;
   5411   unsigned full = INSTR (30, 30);
   5412   unsigned size = INSTR (23, 22);
   5413   unsigned vm = INSTR (20, 16);
   5414   unsigned vn = INSTR (9, 5);
   5415   unsigned vd = INSTR (4, 0);
   5416   unsigned i, range;
   5417 
   5418   NYI_assert (29, 24, 0x0E);
   5419   NYI_assert (21, 21, 1);
   5420   NYI_assert (15, 10, 0x2F);
   5421 
   5422   /* Make copies of the source registers in case vd == vn/vm.  */
   5423   copy_vn = cpu->fr[vn];
   5424   copy_vm = cpu->fr[vm];
   5425 
   5426   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5427   switch (size)
   5428     {
   5429     case 0:
   5430       range = full ? 8 : 4;
   5431       for (i = 0; i < range; i++)
   5432 	{
   5433 	  aarch64_set_vec_u8 (cpu, vd, i,
   5434 			      copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
   5435 	  aarch64_set_vec_u8 (cpu, vd, i + range,
   5436 			      copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
   5437 	}
   5438       return;
   5439 
   5440     case 1:
   5441       range = full ? 4 : 2;
   5442       for (i = 0; i < range; i++)
   5443 	{
   5444 	  aarch64_set_vec_u16 (cpu, vd, i,
   5445 			       copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
   5446 	  aarch64_set_vec_u16 (cpu, vd, i + range,
   5447 			       copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
   5448 	}
   5449       return;
   5450 
   5451     case 2:
   5452       range = full ? 2 : 1;
   5453       for (i = 0; i < range; i++)
   5454 	{
   5455 	  aarch64_set_vec_u32 (cpu, vd, i,
   5456 			       copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
   5457 	  aarch64_set_vec_u32 (cpu, vd, i + range,
   5458 			       copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
   5459 	}
   5460       return;
   5461 
   5462     case 3:
   5463       if (! full)
   5464 	HALT_UNALLOC;
   5465       aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
   5466       aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
   5467       return;
   5468     }
   5469 }
   5470 
   5471 /* Float point vector convert to longer (precision).  */
   5472 static void
   5473 do_vec_FCVTL (sim_cpu *cpu)
   5474 {
   5475   /* instr[31]    = 0
   5476      instr[30]    = half (0) / all (1)
   5477      instr[29,23] = 00 1110 0
   5478      instr[22]    = single (0) / double (1)
   5479      instr[21,10] = 10 0001 0111 10
   5480      instr[9,5]   = Rn
   5481      instr[4,0]   = Rd.  */
   5482 
   5483   unsigned rn = INSTR (9, 5);
   5484   unsigned rd = INSTR (4, 0);
   5485   unsigned full = INSTR (30, 30);
   5486   unsigned i;
   5487 
   5488   NYI_assert (31, 31, 0);
   5489   NYI_assert (29, 23, 0x1C);
   5490   NYI_assert (21, 10, 0x85E);
   5491 
   5492   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5493   if (INSTR (22, 22))
   5494     {
   5495       for (i = 0; i < 2; i++)
   5496 	aarch64_set_vec_double (cpu, rd, i,
   5497 				aarch64_get_vec_float (cpu, rn, i + 2*full));
   5498     }
   5499   else
   5500     {
   5501       HALT_NYI;
   5502 
   5503 #if 0
   5504       /* TODO: Implement missing half-float support.  */
   5505       for (i = 0; i < 4; i++)
   5506 	aarch64_set_vec_float (cpu, rd, i,
   5507 			     aarch64_get_vec_halffloat (cpu, rn, i + 4*full));
   5508 #endif
   5509     }
   5510 }
   5511 
   5512 static void
   5513 do_vec_FABS (sim_cpu *cpu)
   5514 {
   5515   /* instr[31]    = 0
   5516      instr[30]    = half(0)/full(1)
   5517      instr[29,23] = 00 1110 1
   5518      instr[22]    = float(0)/double(1)
   5519      instr[21,16] = 10 0000
   5520      instr[15,10] = 1111 10
   5521      instr[9,5]   = Vn
   5522      instr[4,0]   = Vd.  */
   5523 
   5524   unsigned vn = INSTR (9, 5);
   5525   unsigned vd = INSTR (4, 0);
   5526   unsigned full = INSTR (30, 30);
   5527   unsigned i;
   5528 
   5529   NYI_assert (29, 23, 0x1D);
   5530   NYI_assert (21, 10, 0x83E);
   5531 
   5532   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5533   if (INSTR (22, 22))
   5534     {
   5535       if (! full)
   5536 	HALT_NYI;
   5537 
   5538       for (i = 0; i < 2; i++)
   5539 	aarch64_set_vec_double (cpu, vd, i,
   5540 				fabs (aarch64_get_vec_double (cpu, vn, i)));
   5541     }
   5542   else
   5543     {
   5544       for (i = 0; i < (full ? 4 : 2); i++)
   5545 	aarch64_set_vec_float (cpu, vd, i,
   5546 			       fabsf (aarch64_get_vec_float (cpu, vn, i)));
   5547     }
   5548 }
   5549 
   5550 static void
   5551 do_vec_FCVTZS (sim_cpu *cpu)
   5552 {
   5553   /* instr[31]    = 0
   5554      instr[30]    = half (0) / all (1)
   5555      instr[29,23] = 00 1110 1
   5556      instr[22]    = single (0) / double (1)
   5557      instr[21,10] = 10 0001 1011 10
   5558      instr[9,5]   = Rn
   5559      instr[4,0]   = Rd.  */
   5560 
   5561   unsigned rn = INSTR (9, 5);
   5562   unsigned rd = INSTR (4, 0);
   5563   unsigned full = INSTR (30, 30);
   5564   unsigned i;
   5565 
   5566   NYI_assert (31, 31, 0);
   5567   NYI_assert (29, 23, 0x1D);
   5568   NYI_assert (21, 10, 0x86E);
   5569 
   5570   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5571   if (INSTR (22, 22))
   5572     {
   5573       if (! full)
   5574 	HALT_UNALLOC;
   5575 
   5576       for (i = 0; i < 2; i++)
   5577 	aarch64_set_vec_s64 (cpu, rd, i,
   5578 			     (int64_t) aarch64_get_vec_double (cpu, rn, i));
   5579     }
   5580   else
   5581     for (i = 0; i < (full ? 4 : 2); i++)
   5582       aarch64_set_vec_s32 (cpu, rd, i,
   5583 			   (int32_t) aarch64_get_vec_float (cpu, rn, i));
   5584 }
   5585 
   5586 static void
   5587 do_vec_REV64 (sim_cpu *cpu)
   5588 {
   5589   /* instr[31]    = 0
   5590      instr[30]    = full/half
   5591      instr[29,24] = 00 1110
   5592      instr[23,22] = size
   5593      instr[21,10] = 10 0000 0000 10
   5594      instr[9,5]   = Rn
   5595      instr[4,0]   = Rd.  */
   5596 
   5597   unsigned rn = INSTR (9, 5);
   5598   unsigned rd = INSTR (4, 0);
   5599   unsigned size = INSTR (23, 22);
   5600   unsigned full = INSTR (30, 30);
   5601   unsigned i;
   5602   FRegister val;
   5603 
   5604   NYI_assert (29, 24, 0x0E);
   5605   NYI_assert (21, 10, 0x802);
   5606 
   5607   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5608   switch (size)
   5609     {
   5610     case 0:
   5611       for (i = 0; i < (full ? 16 : 8); i++)
   5612 	val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
   5613       break;
   5614 
   5615     case 1:
   5616       for (i = 0; i < (full ? 8 : 4); i++)
   5617 	val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
   5618       break;
   5619 
   5620     case 2:
   5621       for (i = 0; i < (full ? 4 : 2); i++)
   5622 	val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
   5623       break;
   5624 
   5625     case 3:
   5626       HALT_UNALLOC;
   5627     }
   5628 
   5629   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
   5630   if (full)
   5631     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
   5632 }
   5633 
   5634 static void
   5635 do_vec_REV16 (sim_cpu *cpu)
   5636 {
   5637   /* instr[31]    = 0
   5638      instr[30]    = full/half
   5639      instr[29,24] = 00 1110
   5640      instr[23,22] = size
   5641      instr[21,10] = 10 0000 0001 10
   5642      instr[9,5]   = Rn
   5643      instr[4,0]   = Rd.  */
   5644 
   5645   unsigned rn = INSTR (9, 5);
   5646   unsigned rd = INSTR (4, 0);
   5647   unsigned size = INSTR (23, 22);
   5648   unsigned full = INSTR (30, 30);
   5649   unsigned i;
   5650   FRegister val;
   5651 
   5652   NYI_assert (29, 24, 0x0E);
   5653   NYI_assert (21, 10, 0x806);
   5654 
   5655   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5656   switch (size)
   5657     {
   5658     case 0:
   5659       for (i = 0; i < (full ? 16 : 8); i++)
   5660 	val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
   5661       break;
   5662 
   5663     default:
   5664       HALT_UNALLOC;
   5665     }
   5666 
   5667   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
   5668   if (full)
   5669     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
   5670 }
   5671 
   5672 static void
   5673 do_vec_op1 (sim_cpu *cpu)
   5674 {
   5675   /* instr[31]    = 0
   5676      instr[30]    = half/full
   5677      instr[29,24] = 00 1110
   5678      instr[23,21] = ???
   5679      instr[20,16] = Vm
   5680      instr[15,10] = sub-opcode
   5681      instr[9,5]   = Vn
   5682      instr[4,0]   = Vd  */
   5683   NYI_assert (29, 24, 0x0E);
   5684 
   5685   if (INSTR (21, 21) == 0)
   5686     {
   5687       if (INSTR (23, 22) == 0)
   5688 	{
   5689 	  if (INSTR (30, 30) == 1
   5690 	      && INSTR (17, 14) == 0
   5691 	      && INSTR (12, 10) == 7)
   5692 	    return do_vec_ins_2 (cpu);
   5693 
   5694 	  switch (INSTR (15, 10))
   5695 	    {
   5696 	    case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
   5697 	    case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
   5698 	    case 0x07: do_vec_INS (cpu); return;
   5699 	    case 0x0B: do_vec_SMOV_into_scalar (cpu); return;
   5700 	    case 0x0F: do_vec_UMOV_into_scalar (cpu); return;
   5701 
   5702 	    case 0x00:
   5703 	    case 0x08:
   5704 	    case 0x10:
   5705 	    case 0x18:
   5706 	      do_vec_TBL (cpu); return;
   5707 
   5708 	    case 0x06:
   5709 	    case 0x16:
   5710 	      do_vec_UZP (cpu); return;
   5711 
   5712 	    case 0x0A: do_vec_TRN (cpu); return;
   5713 
   5714 	    case 0x0E:
   5715 	    case 0x1E:
   5716 	      do_vec_ZIP (cpu); return;
   5717 
   5718 	    default:
   5719 	      HALT_NYI;
   5720 	    }
   5721 	}
   5722 
   5723       switch (INSTR (13, 10))
   5724 	{
   5725 	case 0x6: do_vec_UZP (cpu); return;
   5726 	case 0xE: do_vec_ZIP (cpu); return;
   5727 	case 0xA: do_vec_TRN (cpu); return;
   5728 	default:  HALT_NYI;
   5729 	}
   5730     }
   5731 
   5732   switch (INSTR (15, 10))
   5733     {
   5734     case 0x02: do_vec_REV64 (cpu); return;
   5735     case 0x06: do_vec_REV16 (cpu); return;
   5736 
   5737     case 0x07:
   5738       switch (INSTR (23, 21))
   5739 	{
   5740 	case 1: do_vec_AND (cpu); return;
   5741 	case 3: do_vec_BIC (cpu); return;
   5742 	case 5: do_vec_ORR (cpu); return;
   5743 	case 7: do_vec_ORN (cpu); return;
   5744 	default: HALT_NYI;
   5745 	}
   5746 
   5747     case 0x08: do_vec_sub_long (cpu); return;
   5748     case 0x0a: do_vec_XTN (cpu); return;
   5749     case 0x11: do_vec_SSHL (cpu); return;
   5750     case 0x16: do_vec_CNT (cpu); return;
   5751     case 0x19: do_vec_max (cpu); return;
   5752     case 0x1B: do_vec_min (cpu); return;
   5753     case 0x21: do_vec_add (cpu); return;
   5754     case 0x25: do_vec_MLA (cpu); return;
   5755     case 0x27: do_vec_mul (cpu); return;
   5756     case 0x2F: do_vec_ADDP (cpu); return;
   5757     case 0x30: do_vec_mull (cpu); return;
   5758     case 0x33: do_vec_FMLA (cpu); return;
   5759     case 0x35: do_vec_fadd (cpu); return;
   5760 
   5761     case 0x1E:
   5762       switch (INSTR (20, 16))
   5763 	{
   5764 	case 0x01: do_vec_FCVTL (cpu); return;
   5765 	default: HALT_NYI;
   5766 	}
   5767 
   5768     case 0x2E:
   5769       switch (INSTR (20, 16))
   5770 	{
   5771 	case 0x00: do_vec_ABS (cpu); return;
   5772 	case 0x01: do_vec_FCVTZS (cpu); return;
   5773 	case 0x11: do_vec_ADDV (cpu); return;
   5774 	default: HALT_NYI;
   5775 	}
   5776 
   5777     case 0x31:
   5778     case 0x3B:
   5779       do_vec_Fminmax (cpu); return;
   5780 
   5781     case 0x0D:
   5782     case 0x0F:
   5783     case 0x22:
   5784     case 0x23:
   5785     case 0x26:
   5786     case 0x2A:
   5787     case 0x32:
   5788     case 0x36:
   5789     case 0x39:
   5790     case 0x3A:
   5791       do_vec_compare (cpu); return;
   5792 
   5793     case 0x3E:
   5794       do_vec_FABS (cpu); return;
   5795 
   5796     default:
   5797       HALT_NYI;
   5798     }
   5799 }
   5800 
   5801 static void
   5802 do_vec_xtl (sim_cpu *cpu)
   5803 {
   5804   /* instr[31]    = 0
   5805      instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
   5806      instr[28,22] = 0 1111 00
   5807      instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
   5808      instr[15,10] = 1010 01
   5809      instr[9,5]   = V source
   5810      instr[4,0]   = V dest.  */
   5811 
   5812   unsigned vs = INSTR (9, 5);
   5813   unsigned vd = INSTR (4, 0);
   5814   unsigned i, shift, bias = 0;
   5815 
   5816   NYI_assert (28, 22, 0x3C);
   5817   NYI_assert (15, 10, 0x29);
   5818 
   5819   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5820   switch (INSTR (30, 29))
   5821     {
   5822     case 2: /* SXTL2, SSHLL2.  */
   5823       bias = 2;
   5824     case 0: /* SXTL, SSHLL.  */
   5825       if (INSTR (21, 21))
   5826 	{
   5827 	  int64_t val1, val2;
   5828 
   5829 	  shift = INSTR (20, 16);
   5830 	  /* Get the source values before setting the destination values
   5831 	     in case the source and destination are the same.  */
   5832 	  val1 = aarch64_get_vec_s32 (cpu, vs, bias) << shift;
   5833 	  val2 = aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
   5834 	  aarch64_set_vec_s64 (cpu, vd, 0, val1);
   5835 	  aarch64_set_vec_s64 (cpu, vd, 1, val2);
   5836 	}
   5837       else if (INSTR (20, 20))
   5838 	{
   5839 	  int32_t v[4];
   5840 	  int32_t v1,v2,v3,v4;
   5841 
   5842 	  shift = INSTR (19, 16);
   5843 	  bias *= 2;
   5844 	  for (i = 0; i < 4; i++)
   5845 	    v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
   5846 	  for (i = 0; i < 4; i++)
   5847 	    aarch64_set_vec_s32 (cpu, vd, i, v[i]);
   5848 	}
   5849       else
   5850 	{
   5851 	  int16_t v[8];
   5852 	  NYI_assert (19, 19, 1);
   5853 
   5854 	  shift = INSTR (18, 16);
   5855 	  bias *= 4;
   5856 	  for (i = 0; i < 8; i++)
   5857 	    v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
   5858 	  for (i = 0; i < 8; i++)
   5859 	    aarch64_set_vec_s16 (cpu, vd, i, v[i]);
   5860 	}
   5861       return;
   5862 
   5863     case 3: /* UXTL2, USHLL2.  */
   5864       bias = 2;
   5865     case 1: /* UXTL, USHLL.  */
   5866       if (INSTR (21, 21))
   5867 	{
   5868 	  uint64_t v1, v2;
   5869 	  shift = INSTR (20, 16);
   5870 	  v1 = aarch64_get_vec_u32 (cpu, vs, bias) << shift;
   5871 	  v2 = aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
   5872 	  aarch64_set_vec_u64 (cpu, vd, 0, v1);
   5873 	  aarch64_set_vec_u64 (cpu, vd, 1, v2);
   5874 	}
   5875       else if (INSTR (20, 20))
   5876 	{
   5877 	  uint32_t v[4];
   5878 	  shift = INSTR (19, 16);
   5879 	  bias *= 2;
   5880 	  for (i = 0; i < 4; i++)
   5881 	    v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
   5882 	  for (i = 0; i < 4; i++)
   5883 	    aarch64_set_vec_u32 (cpu, vd, i, v[i]);
   5884 	}
   5885       else
   5886 	{
   5887 	  uint16_t v[8];
   5888 	  NYI_assert (19, 19, 1);
   5889 
   5890 	  shift = INSTR (18, 16);
   5891 	  bias *= 4;
   5892 	  for (i = 0; i < 8; i++)
   5893 	    v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
   5894 	  for (i = 0; i < 8; i++)
   5895 	    aarch64_set_vec_u16 (cpu, vd, i, v[i]);
   5896 	}
   5897       return;
   5898     }
   5899 }
   5900 
   5901 static void
   5902 do_vec_SHL (sim_cpu *cpu)
   5903 {
   5904   /* instr [31]    = 0
   5905      instr [30]    = half(0)/full(1)
   5906      instr [29,23] = 001 1110
   5907      instr [22,16] = size and shift amount
   5908      instr [15,10] = 01 0101
   5909      instr [9, 5]  = Vs
   5910      instr [4, 0]  = Vd.  */
   5911 
   5912   int shift;
   5913   int full    = INSTR (30, 30);
   5914   unsigned vs = INSTR (9, 5);
   5915   unsigned vd = INSTR (4, 0);
   5916   unsigned i;
   5917 
   5918   NYI_assert (29, 23, 0x1E);
   5919   NYI_assert (15, 10, 0x15);
   5920 
   5921   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5922   if (INSTR (22, 22))
   5923     {
   5924       shift = INSTR (21, 16);
   5925 
   5926       if (full == 0)
   5927 	HALT_UNALLOC;
   5928 
   5929       for (i = 0; i < 2; i++)
   5930 	{
   5931 	  uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
   5932 	  aarch64_set_vec_u64 (cpu, vd, i, val << shift);
   5933 	}
   5934 
   5935       return;
   5936     }
   5937 
   5938   if (INSTR (21, 21))
   5939     {
   5940       shift = INSTR (20, 16);
   5941 
   5942       for (i = 0; i < (full ? 4 : 2); i++)
   5943 	{
   5944 	  uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
   5945 	  aarch64_set_vec_u32 (cpu, vd, i, val << shift);
   5946 	}
   5947 
   5948       return;
   5949     }
   5950 
   5951   if (INSTR (20, 20))
   5952     {
   5953       shift = INSTR (19, 16);
   5954 
   5955       for (i = 0; i < (full ? 8 : 4); i++)
   5956 	{
   5957 	  uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
   5958 	  aarch64_set_vec_u16 (cpu, vd, i, val << shift);
   5959 	}
   5960 
   5961       return;
   5962     }
   5963 
   5964   if (INSTR (19, 19) == 0)
   5965     HALT_UNALLOC;
   5966 
   5967   shift = INSTR (18, 16);
   5968 
   5969   for (i = 0; i < (full ? 16 : 8); i++)
   5970     {
   5971       uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
   5972       aarch64_set_vec_u8 (cpu, vd, i, val << shift);
   5973     }
   5974 }
   5975 
   5976 static void
   5977 do_vec_SSHR_USHR (sim_cpu *cpu)
   5978 {
   5979   /* instr [31]    = 0
   5980      instr [30]    = half(0)/full(1)
   5981      instr [29]    = signed(0)/unsigned(1)
   5982      instr [28,23] = 0 1111 0
   5983      instr [22,16] = size and shift amount
   5984      instr [15,10] = 0000 01
   5985      instr [9, 5]  = Vs
   5986      instr [4, 0]  = Vd.  */
   5987 
   5988   int full       = INSTR (30, 30);
   5989   int sign       = ! INSTR (29, 29);
   5990   unsigned shift = INSTR (22, 16);
   5991   unsigned vs    = INSTR (9, 5);
   5992   unsigned vd    = INSTR (4, 0);
   5993   unsigned i;
   5994 
   5995   NYI_assert (28, 23, 0x1E);
   5996   NYI_assert (15, 10, 0x01);
   5997 
   5998   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   5999   if (INSTR (22, 22))
   6000     {
   6001       shift = 128 - shift;
   6002 
   6003       if (full == 0)
   6004 	HALT_UNALLOC;
   6005 
   6006       if (sign)
   6007 	for (i = 0; i < 2; i++)
   6008 	  {
   6009 	    int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
   6010 	    aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
   6011 	  }
   6012       else
   6013 	for (i = 0; i < 2; i++)
   6014 	  {
   6015 	    uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
   6016 	    aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
   6017 	  }
   6018 
   6019       return;
   6020     }
   6021 
   6022   if (INSTR (21, 21))
   6023     {
   6024       shift = 64 - shift;
   6025 
   6026       if (sign)
   6027 	for (i = 0; i < (full ? 4 : 2); i++)
   6028 	  {
   6029 	    int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
   6030 	    aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
   6031 	  }
   6032       else
   6033 	for (i = 0; i < (full ? 4 : 2); i++)
   6034 	  {
   6035 	    uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
   6036 	    aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
   6037 	  }
   6038 
   6039       return;
   6040     }
   6041 
   6042   if (INSTR (20, 20))
   6043     {
   6044       shift = 32 - shift;
   6045 
   6046       if (sign)
   6047 	for (i = 0; i < (full ? 8 : 4); i++)
   6048 	  {
   6049 	    int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
   6050 	    aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
   6051 	  }
   6052       else
   6053 	for (i = 0; i < (full ? 8 : 4); i++)
   6054 	  {
   6055 	    uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
   6056 	    aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
   6057 	  }
   6058 
   6059       return;
   6060     }
   6061 
   6062   if (INSTR (19, 19) == 0)
   6063     HALT_UNALLOC;
   6064 
   6065   shift = 16 - shift;
   6066 
   6067   if (sign)
   6068     for (i = 0; i < (full ? 16 : 8); i++)
   6069       {
   6070 	int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
   6071 	aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
   6072       }
   6073   else
   6074     for (i = 0; i < (full ? 16 : 8); i++)
   6075       {
   6076 	uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
   6077 	aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
   6078       }
   6079 }
   6080 
   6081 static void
   6082 do_vec_MUL_by_element (sim_cpu *cpu)
   6083 {
   6084   /* instr[31]    = 0
   6085      instr[30]    = half/full
   6086      instr[29,24] = 00 1111
   6087      instr[23,22] = size
   6088      instr[21]    = L
   6089      instr[20]    = M
   6090      instr[19,16] = m
   6091      instr[15,12] = 1000
   6092      instr[11]    = H
   6093      instr[10]    = 0
   6094      instr[9,5]   = Vn
   6095      instr[4,0]   = Vd  */
   6096 
   6097   unsigned full     = INSTR (30, 30);
   6098   unsigned L        = INSTR (21, 21);
   6099   unsigned H        = INSTR (11, 11);
   6100   unsigned vn       = INSTR (9, 5);
   6101   unsigned vd       = INSTR (4, 0);
   6102   unsigned size     = INSTR (23, 22);
   6103   unsigned index;
   6104   unsigned vm;
   6105   unsigned e;
   6106 
   6107   NYI_assert (29, 24, 0x0F);
   6108   NYI_assert (15, 12, 0x8);
   6109   NYI_assert (10, 10, 0);
   6110 
   6111   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6112   switch (size)
   6113     {
   6114     case 1:
   6115       {
   6116 	/* 16 bit products.  */
   6117 	uint16_t product;
   6118 	uint16_t element1;
   6119 	uint16_t element2;
   6120 
   6121 	index = (H << 2) | (L << 1) | INSTR (20, 20);
   6122 	vm = INSTR (19, 16);
   6123 	element2 = aarch64_get_vec_u16 (cpu, vm, index);
   6124 
   6125 	for (e = 0; e < (full ? 8 : 4); e ++)
   6126 	  {
   6127 	    element1 = aarch64_get_vec_u16 (cpu, vn, e);
   6128 	    product  = element1 * element2;
   6129 	    aarch64_set_vec_u16 (cpu, vd, e, product);
   6130 	  }
   6131       }
   6132       break;
   6133 
   6134     case 2:
   6135       {
   6136 	/* 32 bit products.  */
   6137 	uint32_t product;
   6138 	uint32_t element1;
   6139 	uint32_t element2;
   6140 
   6141 	index = (H << 1) | L;
   6142 	vm = INSTR (20, 16);
   6143 	element2 = aarch64_get_vec_u32 (cpu, vm, index);
   6144 
   6145 	for (e = 0; e < (full ? 4 : 2); e ++)
   6146 	  {
   6147 	    element1 = aarch64_get_vec_u32 (cpu, vn, e);
   6148 	    product  = element1 * element2;
   6149 	    aarch64_set_vec_u32 (cpu, vd, e, product);
   6150 	  }
   6151       }
   6152       break;
   6153 
   6154     default:
   6155       HALT_UNALLOC;
   6156     }
   6157 }
   6158 
   6159 static void
   6160 do_FMLA_by_element (sim_cpu *cpu)
   6161 {
   6162   /* instr[31]    = 0
   6163      instr[30]    = half/full
   6164      instr[29,23] = 00 1111 1
   6165      instr[22]    = size
   6166      instr[21]    = L
   6167      instr[20,16] = m
   6168      instr[15,12] = 0001
   6169      instr[11]    = H
   6170      instr[10]    = 0
   6171      instr[9,5]   = Vn
   6172      instr[4,0]   = Vd  */
   6173 
   6174   unsigned full     = INSTR (30, 30);
   6175   unsigned size     = INSTR (22, 22);
   6176   unsigned L        = INSTR (21, 21);
   6177   unsigned vm       = INSTR (20, 16);
   6178   unsigned H        = INSTR (11, 11);
   6179   unsigned vn       = INSTR (9, 5);
   6180   unsigned vd       = INSTR (4, 0);
   6181   unsigned e;
   6182 
   6183   NYI_assert (29, 23, 0x1F);
   6184   NYI_assert (15, 12, 0x1);
   6185   NYI_assert (10, 10, 0);
   6186 
   6187   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6188   if (size)
   6189     {
   6190       double element1, element2;
   6191 
   6192       if (! full || L)
   6193 	HALT_UNALLOC;
   6194 
   6195       element2 = aarch64_get_vec_double (cpu, vm, H);
   6196 
   6197       for (e = 0; e < 2; e++)
   6198 	{
   6199 	  element1 = aarch64_get_vec_double (cpu, vn, e);
   6200 	  element1 *= element2;
   6201 	  element1 += aarch64_get_vec_double (cpu, vd, e);
   6202 	  aarch64_set_vec_double (cpu, vd, e, element1);
   6203 	}
   6204     }
   6205   else
   6206     {
   6207       float element1;
   6208       float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
   6209 
   6210       for (e = 0; e < (full ? 4 : 2); e++)
   6211 	{
   6212 	  element1 = aarch64_get_vec_float (cpu, vn, e);
   6213 	  element1 *= element2;
   6214 	  element1 += aarch64_get_vec_float (cpu, vd, e);
   6215 	  aarch64_set_vec_float (cpu, vd, e, element1);
   6216 	}
   6217     }
   6218 }
   6219 
   6220 static void
   6221 do_vec_op2 (sim_cpu *cpu)
   6222 {
   6223   /* instr[31]    = 0
   6224      instr[30]    = half/full
   6225      instr[29,24] = 00 1111
   6226      instr[23]    = ?
   6227      instr[22,16] = element size & index
   6228      instr[15,10] = sub-opcode
   6229      instr[9,5]   = Vm
   6230      instr[4,0]   = Vd  */
   6231 
   6232   NYI_assert (29, 24, 0x0F);
   6233 
   6234   if (INSTR (23, 23) != 0)
   6235     {
   6236       switch (INSTR (15, 10))
   6237 	{
   6238 	case 0x04:
   6239 	case 0x06:
   6240 	  do_FMLA_by_element (cpu);
   6241 	  return;
   6242 
   6243 	case 0x20:
   6244 	case 0x22:
   6245 	  do_vec_MUL_by_element (cpu);
   6246 	  return;
   6247 
   6248 	default:
   6249 	  HALT_NYI;
   6250 	}
   6251     }
   6252   else
   6253     {
   6254       switch (INSTR (15, 10))
   6255 	{
   6256 	case 0x01: do_vec_SSHR_USHR (cpu); return;
   6257 	case 0x15: do_vec_SHL (cpu); return;
   6258 	case 0x20:
   6259 	case 0x22: do_vec_MUL_by_element (cpu); return;
   6260 	case 0x29: do_vec_xtl (cpu); return;
   6261 	default:   HALT_NYI;
   6262 	}
   6263     }
   6264 }
   6265 
   6266 static void
   6267 do_vec_neg (sim_cpu *cpu)
   6268 {
   6269   /* instr[31]    = 0
   6270      instr[30]    = full(1)/half(0)
   6271      instr[29,24] = 10 1110
   6272      instr[23,22] = size: byte(00), half (01), word (10), long (11)
   6273      instr[21,10] = 1000 0010 1110
   6274      instr[9,5]   = Vs
   6275      instr[4,0]   = Vd  */
   6276 
   6277   int    full = INSTR (30, 30);
   6278   unsigned vs = INSTR (9, 5);
   6279   unsigned vd = INSTR (4, 0);
   6280   unsigned i;
   6281 
   6282   NYI_assert (29, 24, 0x2E);
   6283   NYI_assert (21, 10, 0x82E);
   6284 
   6285   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6286   switch (INSTR (23, 22))
   6287     {
   6288     case 0:
   6289       for (i = 0; i < (full ? 16 : 8); i++)
   6290 	aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
   6291       return;
   6292 
   6293     case 1:
   6294       for (i = 0; i < (full ? 8 : 4); i++)
   6295 	aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
   6296       return;
   6297 
   6298     case 2:
   6299       for (i = 0; i < (full ? 4 : 2); i++)
   6300 	aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
   6301       return;
   6302 
   6303     case 3:
   6304       if (! full)
   6305 	HALT_NYI;
   6306       for (i = 0; i < 2; i++)
   6307 	aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
   6308       return;
   6309     }
   6310 }
   6311 
   6312 static void
   6313 do_vec_sqrt (sim_cpu *cpu)
   6314 {
   6315   /* instr[31]    = 0
   6316      instr[30]    = full(1)/half(0)
   6317      instr[29,23] = 101 1101
   6318      instr[22]    = single(0)/double(1)
   6319      instr[21,10] = 1000 0111 1110
   6320      instr[9,5]   = Vs
   6321      instr[4,0]   = Vd.  */
   6322 
   6323   int    full = INSTR (30, 30);
   6324   unsigned vs = INSTR (9, 5);
   6325   unsigned vd = INSTR (4, 0);
   6326   unsigned i;
   6327 
   6328   NYI_assert (29, 23, 0x5B);
   6329   NYI_assert (21, 10, 0x87E);
   6330 
   6331   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6332   if (INSTR (22, 22) == 0)
   6333     for (i = 0; i < (full ? 4 : 2); i++)
   6334       aarch64_set_vec_float (cpu, vd, i,
   6335 			     sqrtf (aarch64_get_vec_float (cpu, vs, i)));
   6336   else
   6337     for (i = 0; i < 2; i++)
   6338       aarch64_set_vec_double (cpu, vd, i,
   6339 			      sqrt (aarch64_get_vec_double (cpu, vs, i)));
   6340 }
   6341 
   6342 static void
   6343 do_vec_mls_indexed (sim_cpu *cpu)
   6344 {
   6345   /* instr[31]       = 0
   6346      instr[30]       = half(0)/full(1)
   6347      instr[29,24]    = 10 1111
   6348      instr[23,22]    = 16-bit(01)/32-bit(10)
   6349      instr[21,20+11] = index (if 16-bit)
   6350      instr[21+11]    = index (if 32-bit)
   6351      instr[20,16]    = Vm
   6352      instr[15,12]    = 0100
   6353      instr[11]       = part of index
   6354      instr[10]       = 0
   6355      instr[9,5]      = Vs
   6356      instr[4,0]      = Vd.  */
   6357 
   6358   int    full = INSTR (30, 30);
   6359   unsigned vs = INSTR (9, 5);
   6360   unsigned vd = INSTR (4, 0);
   6361   unsigned vm = INSTR (20, 16);
   6362   unsigned i;
   6363 
   6364   NYI_assert (15, 12, 4);
   6365   NYI_assert (10, 10, 0);
   6366 
   6367   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6368   switch (INSTR (23, 22))
   6369     {
   6370     case 1:
   6371       {
   6372 	unsigned elem;
   6373 	uint32_t val;
   6374 
   6375 	if (vm > 15)
   6376 	  HALT_NYI;
   6377 
   6378 	elem = (INSTR (21, 20) << 1) | INSTR (11, 11);
   6379 	val = aarch64_get_vec_u16 (cpu, vm, elem);
   6380 
   6381 	for (i = 0; i < (full ? 8 : 4); i++)
   6382 	  aarch64_set_vec_u32 (cpu, vd, i,
   6383 			       aarch64_get_vec_u32 (cpu, vd, i) -
   6384 			       (aarch64_get_vec_u32 (cpu, vs, i) * val));
   6385 	return;
   6386       }
   6387 
   6388     case 2:
   6389       {
   6390 	unsigned elem = (INSTR (21, 21) << 1) | INSTR (11, 11);
   6391 	uint64_t val = aarch64_get_vec_u32 (cpu, vm, elem);
   6392 
   6393 	for (i = 0; i < (full ? 4 : 2); i++)
   6394 	  aarch64_set_vec_u64 (cpu, vd, i,
   6395 			       aarch64_get_vec_u64 (cpu, vd, i) -
   6396 			       (aarch64_get_vec_u64 (cpu, vs, i) * val));
   6397 	return;
   6398       }
   6399 
   6400     case 0:
   6401     case 3:
   6402     default:
   6403       HALT_NYI;
   6404     }
   6405 }
   6406 
   6407 static void
   6408 do_vec_SUB (sim_cpu *cpu)
   6409 {
   6410   /* instr [31]    = 0
   6411      instr [30]    = half(0)/full(1)
   6412      instr [29,24] = 10 1110
   6413      instr [23,22] = size: byte(00, half(01), word (10), long (11)
   6414      instr [21]    = 1
   6415      instr [20,16] = Vm
   6416      instr [15,10] = 10 0001
   6417      instr [9, 5]  = Vn
   6418      instr [4, 0]  = Vd.  */
   6419 
   6420   unsigned full = INSTR (30, 30);
   6421   unsigned vm = INSTR (20, 16);
   6422   unsigned vn = INSTR (9, 5);
   6423   unsigned vd = INSTR (4, 0);
   6424   unsigned i;
   6425 
   6426   NYI_assert (29, 24, 0x2E);
   6427   NYI_assert (21, 21, 1);
   6428   NYI_assert (15, 10, 0x21);
   6429 
   6430   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6431   switch (INSTR (23, 22))
   6432     {
   6433     case 0:
   6434       for (i = 0; i < (full ? 16 : 8); i++)
   6435 	aarch64_set_vec_s8 (cpu, vd, i,
   6436 			    aarch64_get_vec_s8 (cpu, vn, i)
   6437 			    - aarch64_get_vec_s8 (cpu, vm, i));
   6438       return;
   6439 
   6440     case 1:
   6441       for (i = 0; i < (full ? 8 : 4); i++)
   6442 	aarch64_set_vec_s16 (cpu, vd, i,
   6443 			     aarch64_get_vec_s16 (cpu, vn, i)
   6444 			     - aarch64_get_vec_s16 (cpu, vm, i));
   6445       return;
   6446 
   6447     case 2:
   6448       for (i = 0; i < (full ? 4 : 2); i++)
   6449 	aarch64_set_vec_s32 (cpu, vd, i,
   6450 			     aarch64_get_vec_s32 (cpu, vn, i)
   6451 			     - aarch64_get_vec_s32 (cpu, vm, i));
   6452       return;
   6453 
   6454     case 3:
   6455       if (full == 0)
   6456 	HALT_UNALLOC;
   6457 
   6458       for (i = 0; i < 2; i++)
   6459 	aarch64_set_vec_s64 (cpu, vd, i,
   6460 			     aarch64_get_vec_s64 (cpu, vn, i)
   6461 			     - aarch64_get_vec_s64 (cpu, vm, i));
   6462       return;
   6463     }
   6464 }
   6465 
   6466 static void
   6467 do_vec_MLS (sim_cpu *cpu)
   6468 {
   6469   /* instr [31]    = 0
   6470      instr [30]    = half(0)/full(1)
   6471      instr [29,24] = 10 1110
   6472      instr [23,22] = size: byte(00, half(01), word (10)
   6473      instr [21]    = 1
   6474      instr [20,16] = Vm
   6475      instr [15,10] = 10 0101
   6476      instr [9, 5]  = Vn
   6477      instr [4, 0]  = Vd.  */
   6478 
   6479   unsigned full = INSTR (30, 30);
   6480   unsigned vm = INSTR (20, 16);
   6481   unsigned vn = INSTR (9, 5);
   6482   unsigned vd = INSTR (4, 0);
   6483   unsigned i;
   6484 
   6485   NYI_assert (29, 24, 0x2E);
   6486   NYI_assert (21, 21, 1);
   6487   NYI_assert (15, 10, 0x25);
   6488 
   6489   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6490   switch (INSTR (23, 22))
   6491     {
   6492     case 0:
   6493       for (i = 0; i < (full ? 16 : 8); i++)
   6494 	aarch64_set_vec_u8 (cpu, vd, i,
   6495 			    aarch64_get_vec_u8 (cpu, vd, i)
   6496 			    - (aarch64_get_vec_u8 (cpu, vn, i)
   6497 			       * aarch64_get_vec_u8 (cpu, vm, i)));
   6498       return;
   6499 
   6500     case 1:
   6501       for (i = 0; i < (full ? 8 : 4); i++)
   6502 	aarch64_set_vec_u16 (cpu, vd, i,
   6503 			     aarch64_get_vec_u16 (cpu, vd, i)
   6504 			     - (aarch64_get_vec_u16 (cpu, vn, i)
   6505 				* aarch64_get_vec_u16 (cpu, vm, i)));
   6506       return;
   6507 
   6508     case 2:
   6509       for (i = 0; i < (full ? 4 : 2); i++)
   6510 	aarch64_set_vec_u32 (cpu, vd, i,
   6511 			     aarch64_get_vec_u32 (cpu, vd, i)
   6512 			     - (aarch64_get_vec_u32 (cpu, vn, i)
   6513 				* aarch64_get_vec_u32 (cpu, vm, i)));
   6514       return;
   6515 
   6516     default:
   6517       HALT_UNALLOC;
   6518     }
   6519 }
   6520 
   6521 static void
   6522 do_vec_FDIV (sim_cpu *cpu)
   6523 {
   6524   /* instr [31]    = 0
   6525      instr [30]    = half(0)/full(1)
   6526      instr [29,23] = 10 1110 0
   6527      instr [22]    = float()/double(1)
   6528      instr [21]    = 1
   6529      instr [20,16] = Vm
   6530      instr [15,10] = 1111 11
   6531      instr [9, 5]  = Vn
   6532      instr [4, 0]  = Vd.  */
   6533 
   6534   unsigned full = INSTR (30, 30);
   6535   unsigned vm = INSTR (20, 16);
   6536   unsigned vn = INSTR (9, 5);
   6537   unsigned vd = INSTR (4, 0);
   6538   unsigned i;
   6539 
   6540   NYI_assert (29, 23, 0x5C);
   6541   NYI_assert (21, 21, 1);
   6542   NYI_assert (15, 10, 0x3F);
   6543 
   6544   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6545   if (INSTR (22, 22))
   6546     {
   6547       if (! full)
   6548 	HALT_UNALLOC;
   6549 
   6550       for (i = 0; i < 2; i++)
   6551 	aarch64_set_vec_double (cpu, vd, i,
   6552 				aarch64_get_vec_double (cpu, vn, i)
   6553 				/ aarch64_get_vec_double (cpu, vm, i));
   6554     }
   6555   else
   6556     for (i = 0; i < (full ? 4 : 2); i++)
   6557       aarch64_set_vec_float (cpu, vd, i,
   6558 			     aarch64_get_vec_float (cpu, vn, i)
   6559 			     / aarch64_get_vec_float (cpu, vm, i));
   6560 }
   6561 
   6562 static void
   6563 do_vec_FMUL (sim_cpu *cpu)
   6564 {
   6565   /* instr [31]    = 0
   6566      instr [30]    = half(0)/full(1)
   6567      instr [29,23] = 10 1110 0
   6568      instr [22]    = float(0)/double(1)
   6569      instr [21]    = 1
   6570      instr [20,16] = Vm
   6571      instr [15,10] = 1101 11
   6572      instr [9, 5]  = Vn
   6573      instr [4, 0]  = Vd.  */
   6574 
   6575   unsigned full = INSTR (30, 30);
   6576   unsigned vm = INSTR (20, 16);
   6577   unsigned vn = INSTR (9, 5);
   6578   unsigned vd = INSTR (4, 0);
   6579   unsigned i;
   6580 
   6581   NYI_assert (29, 23, 0x5C);
   6582   NYI_assert (21, 21, 1);
   6583   NYI_assert (15, 10, 0x37);
   6584 
   6585   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6586   if (INSTR (22, 22))
   6587     {
   6588       if (! full)
   6589 	HALT_UNALLOC;
   6590 
   6591       for (i = 0; i < 2; i++)
   6592 	aarch64_set_vec_double (cpu, vd, i,
   6593 				aarch64_get_vec_double (cpu, vn, i)
   6594 				* aarch64_get_vec_double (cpu, vm, i));
   6595     }
   6596   else
   6597     for (i = 0; i < (full ? 4 : 2); i++)
   6598       aarch64_set_vec_float (cpu, vd, i,
   6599 			     aarch64_get_vec_float (cpu, vn, i)
   6600 			     * aarch64_get_vec_float (cpu, vm, i));
   6601 }
   6602 
   6603 static void
   6604 do_vec_FADDP (sim_cpu *cpu)
   6605 {
   6606   /* instr [31]    = 0
   6607      instr [30]    = half(0)/full(1)
   6608      instr [29,23] = 10 1110 0
   6609      instr [22]    = float(0)/double(1)
   6610      instr [21]    = 1
   6611      instr [20,16] = Vm
   6612      instr [15,10] = 1101 01
   6613      instr [9, 5]  = Vn
   6614      instr [4, 0]  = Vd.  */
   6615 
   6616   unsigned full = INSTR (30, 30);
   6617   unsigned vm = INSTR (20, 16);
   6618   unsigned vn = INSTR (9, 5);
   6619   unsigned vd = INSTR (4, 0);
   6620 
   6621   NYI_assert (29, 23, 0x5C);
   6622   NYI_assert (21, 21, 1);
   6623   NYI_assert (15, 10, 0x35);
   6624 
   6625   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6626   if (INSTR (22, 22))
   6627     {
   6628       /* Extract values before adding them incase vd == vn/vm.  */
   6629       double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
   6630       double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
   6631       double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
   6632       double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
   6633 
   6634       if (! full)
   6635 	HALT_UNALLOC;
   6636 
   6637       aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
   6638       aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
   6639     }
   6640   else
   6641     {
   6642       /* Extract values before adding them incase vd == vn/vm.  */
   6643       float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
   6644       float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
   6645       float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
   6646       float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
   6647 
   6648       if (full)
   6649 	{
   6650 	  float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
   6651 	  float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
   6652 	  float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
   6653 	  float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
   6654 
   6655 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
   6656 	  aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
   6657 	  aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
   6658 	  aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
   6659 	}
   6660       else
   6661 	{
   6662 	  aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
   6663 	  aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
   6664 	}
   6665     }
   6666 }
   6667 
   6668 static void
   6669 do_vec_FSQRT (sim_cpu *cpu)
   6670 {
   6671   /* instr[31]    = 0
   6672      instr[30]    = half(0)/full(1)
   6673      instr[29,23] = 10 1110 1
   6674      instr[22]    = single(0)/double(1)
   6675      instr[21,10] = 10 0001 1111 10
   6676      instr[9,5]   = Vsrc
   6677      instr[4,0]   = Vdest.  */
   6678 
   6679   unsigned vn = INSTR (9, 5);
   6680   unsigned vd = INSTR (4, 0);
   6681   unsigned full = INSTR (30, 30);
   6682   int i;
   6683 
   6684   NYI_assert (29, 23, 0x5D);
   6685   NYI_assert (21, 10, 0x87E);
   6686 
   6687   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6688   if (INSTR (22, 22))
   6689     {
   6690       if (! full)
   6691 	HALT_UNALLOC;
   6692 
   6693       for (i = 0; i < 2; i++)
   6694 	aarch64_set_vec_double (cpu, vd, i,
   6695 				sqrt (aarch64_get_vec_double (cpu, vn, i)));
   6696     }
   6697   else
   6698     {
   6699       for (i = 0; i < (full ? 4 : 2); i++)
   6700 	aarch64_set_vec_float (cpu, vd, i,
   6701 			       sqrtf (aarch64_get_vec_float (cpu, vn, i)));
   6702     }
   6703 }
   6704 
   6705 static void
   6706 do_vec_FNEG (sim_cpu *cpu)
   6707 {
   6708   /* instr[31]    = 0
   6709      instr[30]    = half (0)/full (1)
   6710      instr[29,23] = 10 1110 1
   6711      instr[22]    = single (0)/double (1)
   6712      instr[21,10] = 10 0000 1111 10
   6713      instr[9,5]   = Vsrc
   6714      instr[4,0]   = Vdest.  */
   6715 
   6716   unsigned vn = INSTR (9, 5);
   6717   unsigned vd = INSTR (4, 0);
   6718   unsigned full = INSTR (30, 30);
   6719   int i;
   6720 
   6721   NYI_assert (29, 23, 0x5D);
   6722   NYI_assert (21, 10, 0x83E);
   6723 
   6724   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6725   if (INSTR (22, 22))
   6726     {
   6727       if (! full)
   6728 	HALT_UNALLOC;
   6729 
   6730       for (i = 0; i < 2; i++)
   6731 	aarch64_set_vec_double (cpu, vd, i,
   6732 				- aarch64_get_vec_double (cpu, vn, i));
   6733     }
   6734   else
   6735     {
   6736       for (i = 0; i < (full ? 4 : 2); i++)
   6737 	aarch64_set_vec_float (cpu, vd, i,
   6738 			       - aarch64_get_vec_float (cpu, vn, i));
   6739     }
   6740 }
   6741 
   6742 static void
   6743 do_vec_NOT (sim_cpu *cpu)
   6744 {
   6745   /* instr[31]    = 0
   6746      instr[30]    = half (0)/full (1)
   6747      instr[29,10] = 10 1110 0010 0000 0101 10
   6748      instr[9,5]   = Vn
   6749      instr[4.0]   = Vd.  */
   6750 
   6751   unsigned vn = INSTR (9, 5);
   6752   unsigned vd = INSTR (4, 0);
   6753   unsigned i;
   6754   int      full = INSTR (30, 30);
   6755 
   6756   NYI_assert (29, 10, 0xB8816);
   6757 
   6758   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6759   for (i = 0; i < (full ? 16 : 8); i++)
   6760     aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
   6761 }
   6762 
/* Count the leading zero bits of VAL when it is viewed as a SIZE bit
   wide quantity.  Bits of VAL at position SIZE or above are never
   examined.  Returns SIZE when none of the low SIZE bits is set.
   SIZE is expected to be between 1 and 64.  */
static unsigned int
clz (uint64_t val, unsigned size)
{
  uint64_t probe;
  unsigned int zeros = 0;

  /* Walk a single-bit probe down from the top bit of the element.  */
  for (probe = (uint64_t) 1 << (size - 1); probe != 0; probe >>= 1)
    {
      if (val & probe)
        return zeros;
      zeros++;
    }

  return zeros;
}
   6782 
   6783 static void
   6784 do_vec_CLZ (sim_cpu *cpu)
   6785 {
   6786   /* instr[31]    = 0
   6787      instr[30]    = half (0)/full (1)
   6788      instr[29,24] = 10 1110
   6789      instr[23,22] = size
   6790      instr[21,10] = 10 0000 0100 10
   6791      instr[9,5]   = Vn
   6792      instr[4.0]   = Vd.  */
   6793 
   6794   unsigned vn = INSTR (9, 5);
   6795   unsigned vd = INSTR (4, 0);
   6796   unsigned i;
   6797   int      full = INSTR (30,30);
   6798 
   6799   NYI_assert (29, 24, 0x2E);
   6800   NYI_assert (21, 10, 0x812);
   6801 
   6802   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6803   switch (INSTR (23, 22))
   6804     {
   6805     case 0:
   6806       for (i = 0; i < (full ? 16 : 8); i++)
   6807 	aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
   6808       break;
   6809     case 1:
   6810       for (i = 0; i < (full ? 8 : 4); i++)
   6811 	aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
   6812       break;
   6813     case 2:
   6814       for (i = 0; i < (full ? 4 : 2); i++)
   6815 	aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
   6816       break;
   6817     case 3:
   6818       if (! full)
   6819 	HALT_UNALLOC;
   6820       aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
   6821       aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
   6822       break;
   6823     }
   6824 }
   6825 
   6826 static void
   6827 do_vec_MOV_element (sim_cpu *cpu)
   6828 {
   6829   /* instr[31,21] = 0110 1110 000
   6830      instr[20,16] = size & dest index
   6831      instr[15]    = 0
   6832      instr[14,11] = source index
   6833      instr[10]    = 1
   6834      instr[9,5]   = Vs
   6835      instr[4.0]   = Vd.  */
   6836 
   6837   unsigned vs = INSTR (9, 5);
   6838   unsigned vd = INSTR (4, 0);
   6839   unsigned src_index;
   6840   unsigned dst_index;
   6841 
   6842   NYI_assert (31, 21, 0x370);
   6843   NYI_assert (15, 15, 0);
   6844   NYI_assert (10, 10, 1);
   6845 
   6846   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6847   if (INSTR (16, 16))
   6848     {
   6849       /* Move a byte.  */
   6850       src_index = INSTR (14, 11);
   6851       dst_index = INSTR (20, 17);
   6852       aarch64_set_vec_u8 (cpu, vd, dst_index,
   6853 			  aarch64_get_vec_u8 (cpu, vs, src_index));
   6854     }
   6855   else if (INSTR (17, 17))
   6856     {
   6857       /* Move 16-bits.  */
   6858       NYI_assert (11, 11, 0);
   6859       src_index = INSTR (14, 12);
   6860       dst_index = INSTR (20, 18);
   6861       aarch64_set_vec_u16 (cpu, vd, dst_index,
   6862 			   aarch64_get_vec_u16 (cpu, vs, src_index));
   6863     }
   6864   else if (INSTR (18, 18))
   6865     {
   6866       /* Move 32-bits.  */
   6867       NYI_assert (12, 11, 0);
   6868       src_index = INSTR (14, 13);
   6869       dst_index = INSTR (20, 19);
   6870       aarch64_set_vec_u32 (cpu, vd, dst_index,
   6871 			   aarch64_get_vec_u32 (cpu, vs, src_index));
   6872     }
   6873   else
   6874     {
   6875       NYI_assert (19, 19, 1);
   6876       NYI_assert (13, 11, 0);
   6877       src_index = INSTR (14, 14);
   6878       dst_index = INSTR (20, 20);
   6879       aarch64_set_vec_u64 (cpu, vd, dst_index,
   6880 			   aarch64_get_vec_u64 (cpu, vs, src_index));
   6881     }
   6882 }
   6883 
   6884 static void
   6885 do_vec_REV32 (sim_cpu *cpu)
   6886 {
   6887   /* instr[31]    = 0
   6888      instr[30]    = full/half
   6889      instr[29,24] = 10 1110
   6890      instr[23,22] = size
   6891      instr[21,10] = 10 0000 0000 10
   6892      instr[9,5]   = Rn
   6893      instr[4,0]   = Rd.  */
   6894 
   6895   unsigned rn = INSTR (9, 5);
   6896   unsigned rd = INSTR (4, 0);
   6897   unsigned size = INSTR (23, 22);
   6898   unsigned full = INSTR (30, 30);
   6899   unsigned i;
   6900   FRegister val;
   6901 
   6902   NYI_assert (29, 24, 0x2E);
   6903   NYI_assert (21, 10, 0x802);
   6904 
   6905   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6906   switch (size)
   6907     {
   6908     case 0:
   6909       for (i = 0; i < (full ? 16 : 8); i++)
   6910 	val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
   6911       break;
   6912 
   6913     case 1:
   6914       for (i = 0; i < (full ? 8 : 4); i++)
   6915 	val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
   6916       break;
   6917 
   6918     default:
   6919       HALT_UNALLOC;
   6920     }
   6921 
   6922   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
   6923   if (full)
   6924     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
   6925 }
   6926 
   6927 static void
   6928 do_vec_EXT (sim_cpu *cpu)
   6929 {
   6930   /* instr[31]    = 0
   6931      instr[30]    = full/half
   6932      instr[29,21] = 10 1110 000
   6933      instr[20,16] = Vm
   6934      instr[15]    = 0
   6935      instr[14,11] = source index
   6936      instr[10]    = 0
   6937      instr[9,5]   = Vn
   6938      instr[4.0]   = Vd.  */
   6939 
   6940   unsigned vm = INSTR (20, 16);
   6941   unsigned vn = INSTR (9, 5);
   6942   unsigned vd = INSTR (4, 0);
   6943   unsigned src_index = INSTR (14, 11);
   6944   unsigned full = INSTR (30, 30);
   6945   unsigned i;
   6946   unsigned j;
   6947   FRegister val;
   6948 
   6949   NYI_assert (31, 21, 0x370);
   6950   NYI_assert (15, 15, 0);
   6951   NYI_assert (10, 10, 0);
   6952 
   6953   if (!full && (src_index & 0x8))
   6954     HALT_UNALLOC;
   6955 
   6956   j = 0;
   6957 
   6958   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   6959   for (i = src_index; i < (full ? 16 : 8); i++)
   6960     val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
   6961   for (i = 0; i < src_index; i++)
   6962     val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
   6963 
   6964   aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
   6965   if (full)
   6966     aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
   6967 }
   6968 
   6969 static void
   6970 dexAdvSIMD0 (sim_cpu *cpu)
   6971 {
   6972   /* instr [28,25] = 0 111.  */
   6973   if (    INSTR (15, 10) == 0x07
   6974       && (INSTR (9, 5) ==
   6975 	  INSTR (20, 16)))
   6976     {
   6977       if (INSTR (31, 21) == 0x075
   6978 	  || INSTR (31, 21) == 0x275)
   6979 	{
   6980 	  do_vec_MOV_whole_vector (cpu);
   6981 	  return;
   6982 	}
   6983     }
   6984 
   6985   if (INSTR (29, 19) == 0x1E0)
   6986     {
   6987       do_vec_MOV_immediate (cpu);
   6988       return;
   6989     }
   6990 
   6991   if (INSTR (29, 19) == 0x5E0)
   6992     {
   6993       do_vec_MVNI (cpu);
   6994       return;
   6995     }
   6996 
   6997   if (INSTR (29, 19) == 0x1C0
   6998       || INSTR (29, 19) == 0x1C1)
   6999     {
   7000       if (INSTR (15, 10) == 0x03)
   7001 	{
   7002 	  do_vec_DUP_scalar_into_vector (cpu);
   7003 	  return;
   7004 	}
   7005     }
   7006 
   7007   switch (INSTR (29, 24))
   7008     {
   7009     case 0x0E: do_vec_op1 (cpu); return;
   7010     case 0x0F: do_vec_op2 (cpu); return;
   7011 
   7012     case 0x2E:
   7013       if (INSTR (21, 21) == 1)
   7014 	{
   7015 	  switch (INSTR (15, 10))
   7016 	    {
   7017 	    case 0x02:
   7018 	      do_vec_REV32 (cpu);
   7019 	      return;
   7020 
   7021 	    case 0x07:
   7022 	      switch (INSTR (23, 22))
   7023 		{
   7024 		case 0: do_vec_EOR (cpu); return;
   7025 		case 1: do_vec_BSL (cpu); return;
   7026 		case 2:
   7027 		case 3: do_vec_bit (cpu); return;
   7028 		}
   7029 	      break;
   7030 
   7031 	    case 0x08: do_vec_sub_long (cpu); return;
   7032 	    case 0x11: do_vec_USHL (cpu); return;
   7033 	    case 0x12: do_vec_CLZ (cpu); return;
   7034 	    case 0x16: do_vec_NOT (cpu); return;
   7035 	    case 0x19: do_vec_max (cpu); return;
   7036 	    case 0x1B: do_vec_min (cpu); return;
   7037 	    case 0x21: do_vec_SUB (cpu); return;
   7038 	    case 0x25: do_vec_MLS (cpu); return;
   7039 	    case 0x31: do_vec_FminmaxNMP (cpu); return;
   7040 	    case 0x35: do_vec_FADDP (cpu); return;
   7041 	    case 0x37: do_vec_FMUL (cpu); return;
   7042 	    case 0x3F: do_vec_FDIV (cpu); return;
   7043 
   7044 	    case 0x3E:
   7045 	      switch (INSTR (20, 16))
   7046 		{
   7047 		case 0x00: do_vec_FNEG (cpu); return;
   7048 		case 0x01: do_vec_FSQRT (cpu); return;
   7049 		default:   HALT_NYI;
   7050 		}
   7051 
   7052 	    case 0x0D:
   7053 	    case 0x0F:
   7054 	    case 0x22:
   7055 	    case 0x23:
   7056 	    case 0x26:
   7057 	    case 0x2A:
   7058 	    case 0x32:
   7059 	    case 0x36:
   7060 	    case 0x39:
   7061 	    case 0x3A:
   7062 	      do_vec_compare (cpu); return;
   7063 
   7064 	    default:
   7065 	      break;
   7066 	    }
   7067 	}
   7068 
   7069       if (INSTR (31, 21) == 0x370)
   7070 	{
   7071 	  if (INSTR (10, 10))
   7072 	    do_vec_MOV_element (cpu);
   7073 	  else
   7074 	    do_vec_EXT (cpu);
   7075 	  return;
   7076 	}
   7077 
   7078       switch (INSTR (21, 10))
   7079 	{
   7080 	case 0x82E: do_vec_neg (cpu); return;
   7081 	case 0x87E: do_vec_sqrt (cpu); return;
   7082 	default:
   7083 	  if (INSTR (15, 10) == 0x30)
   7084 	    {
   7085 	      do_vec_mull (cpu);
   7086 	      return;
   7087 	    }
   7088 	  break;
   7089 	}
   7090       break;
   7091 
   7092     case 0x2f:
   7093       switch (INSTR (15, 10))
   7094 	{
   7095 	case 0x01: do_vec_SSHR_USHR (cpu); return;
   7096 	case 0x10:
   7097 	case 0x12: do_vec_mls_indexed (cpu); return;
   7098 	case 0x29: do_vec_xtl (cpu); return;
   7099 	default:
   7100 	  HALT_NYI;
   7101 	}
   7102 
   7103     default:
   7104       break;
   7105     }
   7106 
   7107   HALT_NYI;
   7108 }
   7109 
   7110 /* 3 sources.  */
   7111 
   7112 /* Float multiply add.  */
   7113 static void
   7114 fmadds (sim_cpu *cpu)
   7115 {
   7116   unsigned sa = INSTR (14, 10);
   7117   unsigned sm = INSTR (20, 16);
   7118   unsigned sn = INSTR ( 9,  5);
   7119   unsigned sd = INSTR ( 4,  0);
   7120 
   7121   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7122   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
   7123 			+ aarch64_get_FP_float (cpu, sn)
   7124 			* aarch64_get_FP_float (cpu, sm));
   7125 }
   7126 
   7127 /* Double multiply add.  */
   7128 static void
   7129 fmaddd (sim_cpu *cpu)
   7130 {
   7131   unsigned sa = INSTR (14, 10);
   7132   unsigned sm = INSTR (20, 16);
   7133   unsigned sn = INSTR ( 9,  5);
   7134   unsigned sd = INSTR ( 4,  0);
   7135 
   7136   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7137   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
   7138 			 + aarch64_get_FP_double (cpu, sn)
   7139 			 * aarch64_get_FP_double (cpu, sm));
   7140 }
   7141 
   7142 /* Float multiply subtract.  */
   7143 static void
   7144 fmsubs (sim_cpu *cpu)
   7145 {
   7146   unsigned sa = INSTR (14, 10);
   7147   unsigned sm = INSTR (20, 16);
   7148   unsigned sn = INSTR ( 9,  5);
   7149   unsigned sd = INSTR ( 4,  0);
   7150 
   7151   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7152   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
   7153 			- aarch64_get_FP_float (cpu, sn)
   7154 			* aarch64_get_FP_float (cpu, sm));
   7155 }
   7156 
   7157 /* Double multiply subtract.  */
   7158 static void
   7159 fmsubd (sim_cpu *cpu)
   7160 {
   7161   unsigned sa = INSTR (14, 10);
   7162   unsigned sm = INSTR (20, 16);
   7163   unsigned sn = INSTR ( 9,  5);
   7164   unsigned sd = INSTR ( 4,  0);
   7165 
   7166   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7167   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
   7168 			 - aarch64_get_FP_double (cpu, sn)
   7169 			 * aarch64_get_FP_double (cpu, sm));
   7170 }
   7171 
   7172 /* Float negative multiply add.  */
   7173 static void
   7174 fnmadds (sim_cpu *cpu)
   7175 {
   7176   unsigned sa = INSTR (14, 10);
   7177   unsigned sm = INSTR (20, 16);
   7178   unsigned sn = INSTR ( 9,  5);
   7179   unsigned sd = INSTR ( 4,  0);
   7180 
   7181   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7182   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
   7183 			+ (- aarch64_get_FP_float (cpu, sn))
   7184 			* aarch64_get_FP_float (cpu, sm));
   7185 }
   7186 
   7187 /* Double negative multiply add.  */
   7188 static void
   7189 fnmaddd (sim_cpu *cpu)
   7190 {
   7191   unsigned sa = INSTR (14, 10);
   7192   unsigned sm = INSTR (20, 16);
   7193   unsigned sn = INSTR ( 9,  5);
   7194   unsigned sd = INSTR ( 4,  0);
   7195 
   7196   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7197   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
   7198 			 + (- aarch64_get_FP_double (cpu, sn))
   7199 			 * aarch64_get_FP_double (cpu, sm));
   7200 }
   7201 
   7202 /* Float negative multiply subtract.  */
   7203 static void
   7204 fnmsubs (sim_cpu *cpu)
   7205 {
   7206   unsigned sa = INSTR (14, 10);
   7207   unsigned sm = INSTR (20, 16);
   7208   unsigned sn = INSTR ( 9,  5);
   7209   unsigned sd = INSTR ( 4,  0);
   7210 
   7211   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7212   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
   7213 			+ aarch64_get_FP_float (cpu, sn)
   7214 			* aarch64_get_FP_float (cpu, sm));
   7215 }
   7216 
   7217 /* Double negative multiply subtract.  */
   7218 static void
   7219 fnmsubd (sim_cpu *cpu)
   7220 {
   7221   unsigned sa = INSTR (14, 10);
   7222   unsigned sm = INSTR (20, 16);
   7223   unsigned sn = INSTR ( 9,  5);
   7224   unsigned sd = INSTR ( 4,  0);
   7225 
   7226   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7227   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
   7228 			 + aarch64_get_FP_double (cpu, sn)
   7229 			 * aarch64_get_FP_double (cpu, sm));
   7230 }
   7231 
   7232 static void
   7233 dexSimpleFPDataProc3Source (sim_cpu *cpu)
   7234 {
   7235   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
   7236      instr[30]    = 0
   7237      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
   7238      instr[28,25] = 1111
   7239      instr[24]    = 1
   7240      instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
   7241      instr[21]    ==> o1 : 0 ==> unnegated, 1 ==> negated
   7242      instr[15]    ==> o2 : 0 ==> ADD, 1 ==> SUB  */
   7243 
   7244   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
   7245   /* dispatch on combined type:o1:o2.  */
   7246   uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
   7247 
   7248   if (M_S != 0)
   7249     HALT_UNALLOC;
   7250 
   7251   switch (dispatch)
   7252     {
   7253     case 0: fmadds (cpu); return;
   7254     case 1: fmsubs (cpu); return;
   7255     case 2: fnmadds (cpu); return;
   7256     case 3: fnmsubs (cpu); return;
   7257     case 4: fmaddd (cpu); return;
   7258     case 5: fmsubd (cpu); return;
   7259     case 6: fnmaddd (cpu); return;
   7260     case 7: fnmsubd (cpu); return;
   7261     default:
   7262       /* type > 1 is currently unallocated.  */
   7263       HALT_UNALLOC;
   7264     }
   7265 }
   7266 
   7267 static void
   7268 dexSimpleFPFixedConvert (sim_cpu *cpu)
   7269 {
   7270   HALT_NYI;
   7271 }
   7272 
   7273 static void
   7274 dexSimpleFPCondCompare (sim_cpu *cpu)
   7275 {
   7276   /* instr [31,23] = 0001 1110 0
   7277      instr [22]    = type
   7278      instr [21]    = 1
   7279      instr [20,16] = Rm
   7280      instr [15,12] = condition
   7281      instr [11,10] = 01
   7282      instr [9,5]   = Rn
   7283      instr [4]     = 0
   7284      instr [3,0]   = nzcv  */
   7285 
   7286   unsigned rm = INSTR (20, 16);
   7287   unsigned rn = INSTR (9, 5);
   7288 
   7289   NYI_assert (31, 23, 0x3C);
   7290   NYI_assert (11, 10, 0x1);
   7291   NYI_assert (4,  4,  0);
   7292 
   7293   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7294   if (! testConditionCode (cpu, INSTR (15, 12)))
   7295     {
   7296       aarch64_set_CPSR (cpu, INSTR (3, 0));
   7297       return;
   7298     }
   7299 
   7300   if (INSTR (22, 22))
   7301     {
   7302       /* Double precision.  */
   7303       double val1 = aarch64_get_vec_double (cpu, rn, 0);
   7304       double val2 = aarch64_get_vec_double (cpu, rm, 0);
   7305 
   7306       /* FIXME: Check for NaNs.  */
   7307       if (val1 == val2)
   7308 	aarch64_set_CPSR (cpu, (Z | C));
   7309       else if (val1 < val2)
   7310 	aarch64_set_CPSR (cpu, N);
   7311       else /* val1 > val2 */
   7312 	aarch64_set_CPSR (cpu, C);
   7313     }
   7314   else
   7315     {
   7316       /* Single precision.  */
   7317       float val1 = aarch64_get_vec_float (cpu, rn, 0);
   7318       float val2 = aarch64_get_vec_float (cpu, rm, 0);
   7319 
   7320       /* FIXME: Check for NaNs.  */
   7321       if (val1 == val2)
   7322 	aarch64_set_CPSR (cpu, (Z | C));
   7323       else if (val1 < val2)
   7324 	aarch64_set_CPSR (cpu, N);
   7325       else /* val1 > val2 */
   7326 	aarch64_set_CPSR (cpu, C);
   7327     }
   7328 }
   7329 
   7330 /* 2 sources.  */
   7331 
   7332 /* Float add.  */
   7333 static void
   7334 fadds (sim_cpu *cpu)
   7335 {
   7336   unsigned sm = INSTR (20, 16);
   7337   unsigned sn = INSTR ( 9,  5);
   7338   unsigned sd = INSTR ( 4,  0);
   7339 
   7340   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7341   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
   7342 			+ aarch64_get_FP_float (cpu, sm));
   7343 }
   7344 
   7345 /* Double add.  */
   7346 static void
   7347 faddd (sim_cpu *cpu)
   7348 {
   7349   unsigned sm = INSTR (20, 16);
   7350   unsigned sn = INSTR ( 9,  5);
   7351   unsigned sd = INSTR ( 4,  0);
   7352 
   7353   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7354   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
   7355 			 + aarch64_get_FP_double (cpu, sm));
   7356 }
   7357 
   7358 /* Float divide.  */
   7359 static void
   7360 fdivs (sim_cpu *cpu)
   7361 {
   7362   unsigned sm = INSTR (20, 16);
   7363   unsigned sn = INSTR ( 9,  5);
   7364   unsigned sd = INSTR ( 4,  0);
   7365 
   7366   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7367   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
   7368 			/ aarch64_get_FP_float (cpu, sm));
   7369 }
   7370 
   7371 /* Double divide.  */
   7372 static void
   7373 fdivd (sim_cpu *cpu)
   7374 {
   7375   unsigned sm = INSTR (20, 16);
   7376   unsigned sn = INSTR ( 9,  5);
   7377   unsigned sd = INSTR ( 4,  0);
   7378 
   7379   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7380   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
   7381 			 / aarch64_get_FP_double (cpu, sm));
   7382 }
   7383 
   7384 /* Float multiply.  */
   7385 static void
   7386 fmuls (sim_cpu *cpu)
   7387 {
   7388   unsigned sm = INSTR (20, 16);
   7389   unsigned sn = INSTR ( 9,  5);
   7390   unsigned sd = INSTR ( 4,  0);
   7391 
   7392   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7393   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
   7394 			* aarch64_get_FP_float (cpu, sm));
   7395 }
   7396 
   7397 /* Double multiply.  */
   7398 static void
   7399 fmuld (sim_cpu *cpu)
   7400 {
   7401   unsigned sm = INSTR (20, 16);
   7402   unsigned sn = INSTR ( 9,  5);
   7403   unsigned sd = INSTR ( 4,  0);
   7404 
   7405   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7406   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
   7407 			 * aarch64_get_FP_double (cpu, sm));
   7408 }
   7409 
   7410 /* Float negate and multiply.  */
   7411 static void
   7412 fnmuls (sim_cpu *cpu)
   7413 {
   7414   unsigned sm = INSTR (20, 16);
   7415   unsigned sn = INSTR ( 9,  5);
   7416   unsigned sd = INSTR ( 4,  0);
   7417 
   7418   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7419   aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
   7420 				    * aarch64_get_FP_float (cpu, sm)));
   7421 }
   7422 
   7423 /* Double negate and multiply.  */
   7424 static void
   7425 fnmuld (sim_cpu *cpu)
   7426 {
   7427   unsigned sm = INSTR (20, 16);
   7428   unsigned sn = INSTR ( 9,  5);
   7429   unsigned sd = INSTR ( 4,  0);
   7430 
   7431   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7432   aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
   7433 				     * aarch64_get_FP_double (cpu, sm)));
   7434 }
   7435 
   7436 /* Float subtract.  */
   7437 static void
   7438 fsubs (sim_cpu *cpu)
   7439 {
   7440   unsigned sm = INSTR (20, 16);
   7441   unsigned sn = INSTR ( 9,  5);
   7442   unsigned sd = INSTR ( 4,  0);
   7443 
   7444   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7445   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
   7446 			- aarch64_get_FP_float (cpu, sm));
   7447 }
   7448 
   7449 /* Double subtract.  */
   7450 static void
   7451 fsubd (sim_cpu *cpu)
   7452 {
   7453   unsigned sm = INSTR (20, 16);
   7454   unsigned sn = INSTR ( 9,  5);
   7455   unsigned sd = INSTR ( 4,  0);
   7456 
   7457   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7458   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
   7459 			 - aarch64_get_FP_double (cpu, sm));
   7460 }
   7461 
   7462 static void
   7463 do_FMINNM (sim_cpu *cpu)
   7464 {
   7465   /* instr[31,23] = 0 0011 1100
   7466      instr[22]    = float(0)/double(1)
   7467      instr[21]    = 1
   7468      instr[20,16] = Sm
   7469      instr[15,10] = 01 1110
   7470      instr[9,5]   = Sn
   7471      instr[4,0]   = Cpu  */
   7472 
   7473   unsigned sm = INSTR (20, 16);
   7474   unsigned sn = INSTR ( 9,  5);
   7475   unsigned sd = INSTR ( 4,  0);
   7476 
   7477   NYI_assert (31, 23, 0x03C);
   7478   NYI_assert (15, 10, 0x1E);
   7479 
   7480   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7481   if (INSTR (22, 22))
   7482     aarch64_set_FP_double (cpu, sd,
   7483 			   dminnm (aarch64_get_FP_double (cpu, sn),
   7484 				   aarch64_get_FP_double (cpu, sm)));
   7485   else
   7486     aarch64_set_FP_float (cpu, sd,
   7487 			  fminnm (aarch64_get_FP_float (cpu, sn),
   7488 				  aarch64_get_FP_float (cpu, sm)));
   7489 }
   7490 
   7491 static void
   7492 do_FMAXNM (sim_cpu *cpu)
   7493 {
   7494   /* instr[31,23] = 0 0011 1100
   7495      instr[22]    = float(0)/double(1)
   7496      instr[21]    = 1
   7497      instr[20,16] = Sm
   7498      instr[15,10] = 01 1010
   7499      instr[9,5]   = Sn
   7500      instr[4,0]   = Cpu  */
   7501 
   7502   unsigned sm = INSTR (20, 16);
   7503   unsigned sn = INSTR ( 9,  5);
   7504   unsigned sd = INSTR ( 4,  0);
   7505 
   7506   NYI_assert (31, 23, 0x03C);
   7507   NYI_assert (15, 10, 0x1A);
   7508 
   7509   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7510   if (INSTR (22, 22))
   7511     aarch64_set_FP_double (cpu, sd,
   7512 			   dmaxnm (aarch64_get_FP_double (cpu, sn),
   7513 				   aarch64_get_FP_double (cpu, sm)));
   7514   else
   7515     aarch64_set_FP_float (cpu, sd,
   7516 			  fmaxnm (aarch64_get_FP_float (cpu, sn),
   7517 				  aarch64_get_FP_float (cpu, sm)));
   7518 }
   7519 
   7520 static void
   7521 dexSimpleFPDataProc2Source (sim_cpu *cpu)
   7522 {
   7523   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
   7524      instr[30]    = 0
   7525      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
   7526      instr[28,25] = 1111
   7527      instr[24]    = 0
   7528      instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
   7529      instr[21]    = 1
   7530      instr[20,16] = Vm
   7531      instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
   7532                                0010 ==> FADD, 0011 ==> FSUB,
   7533                                0100 ==> FMAX, 0101 ==> FMIN
   7534                                0110 ==> FMAXNM, 0111 ==> FMINNM
   7535                                1000 ==> FNMUL, ow ==> UNALLOC
   7536      instr[11,10] = 10
   7537      instr[9,5]   = Vn
   7538      instr[4,0]   = Vd  */
   7539 
   7540   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
   7541   uint32_t type = INSTR (23, 22);
   7542   /* Dispatch on opcode.  */
   7543   uint32_t dispatch = INSTR (15, 12);
   7544 
   7545   if (type > 1)
   7546     HALT_UNALLOC;
   7547 
   7548   if (M_S != 0)
   7549     HALT_UNALLOC;
   7550 
   7551   if (type)
   7552     switch (dispatch)
   7553       {
   7554       case 0: fmuld (cpu); return;
   7555       case 1: fdivd (cpu); return;
   7556       case 2: faddd (cpu); return;
   7557       case 3: fsubd (cpu); return;
   7558       case 6: do_FMAXNM (cpu); return;
   7559       case 7: do_FMINNM (cpu); return;
   7560       case 8: fnmuld (cpu); return;
   7561 
   7562 	/* Have not yet implemented fmax and fmin.  */
   7563       case 4:
   7564       case 5:
   7565 	HALT_NYI;
   7566 
   7567       default:
   7568 	HALT_UNALLOC;
   7569       }
   7570   else /* type == 0 => floats.  */
   7571     switch (dispatch)
   7572       {
   7573       case 0: fmuls (cpu); return;
   7574       case 1: fdivs (cpu); return;
   7575       case 2: fadds (cpu); return;
   7576       case 3: fsubs (cpu); return;
   7577       case 6: do_FMAXNM (cpu); return;
   7578       case 7: do_FMINNM (cpu); return;
   7579       case 8: fnmuls (cpu); return;
   7580 
   7581       case 4:
   7582       case 5:
   7583 	HALT_NYI;
   7584 
   7585       default:
   7586 	HALT_UNALLOC;
   7587       }
   7588 }
   7589 
   7590 static void
   7591 dexSimpleFPCondSelect (sim_cpu *cpu)
   7592 {
   7593   /* FCSEL
   7594      instr[31,23] = 0 0011 1100
   7595      instr[22]    = 0=>single 1=>double
   7596      instr[21]    = 1
   7597      instr[20,16] = Sm
   7598      instr[15,12] = cond
   7599      instr[11,10] = 11
   7600      instr[9,5]   = Sn
   7601      instr[4,0]   = Cpu  */
   7602   unsigned sm = INSTR (20, 16);
   7603   unsigned sn = INSTR ( 9, 5);
   7604   unsigned sd = INSTR ( 4, 0);
   7605   uint32_t set = testConditionCode (cpu, INSTR (15, 12));
   7606 
   7607   NYI_assert (31, 23, 0x03C);
   7608   NYI_assert (11, 10, 0x3);
   7609 
   7610   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7611   if (INSTR (22, 22))
   7612     aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
   7613 				     : aarch64_get_FP_double (cpu, sm)));
   7614   else
   7615     aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
   7616 				    : aarch64_get_FP_float (cpu, sm)));
   7617 }
   7618 
   7619 /* Store 32 bit unscaled signed 9 bit.  */
   7620 static void
   7621 fsturs (sim_cpu *cpu, int32_t offset)
   7622 {
   7623   unsigned int rn = INSTR (9, 5);
   7624   unsigned int st = INSTR (4, 0);
   7625 
   7626   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7627   aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset,
   7628 		       aarch64_get_vec_u32 (cpu, st, 0));
   7629 }
   7630 
   7631 /* Store 64 bit unscaled signed 9 bit.  */
   7632 static void
   7633 fsturd (sim_cpu *cpu, int32_t offset)
   7634 {
   7635   unsigned int rn = INSTR (9, 5);
   7636   unsigned int st = INSTR (4, 0);
   7637 
   7638   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7639   aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset,
   7640 		       aarch64_get_vec_u64 (cpu, st, 0));
   7641 }
   7642 
   7643 /* Store 128 bit unscaled signed 9 bit.  */
   7644 static void
   7645 fsturq (sim_cpu *cpu, int32_t offset)
   7646 {
   7647   unsigned int rn = INSTR (9, 5);
   7648   unsigned int st = INSTR (4, 0);
   7649   FRegister a;
   7650 
   7651   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7652   aarch64_get_FP_long_double (cpu, st, & a);
   7653   aarch64_set_mem_long_double (cpu,
   7654 			       aarch64_get_reg_u64 (cpu, rn, 1)
   7655 			       + offset, a);
   7656 }
   7657 
   7658 /* TODO FP move register.  */
   7659 
   7660 /* 32 bit fp to fp move register.  */
   7661 static void
   7662 ffmovs (sim_cpu *cpu)
   7663 {
   7664   unsigned int rn = INSTR (9, 5);
   7665   unsigned int st = INSTR (4, 0);
   7666 
   7667   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7668   aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
   7669 }
   7670 
   7671 /* 64 bit fp to fp move register.  */
   7672 static void
   7673 ffmovd (sim_cpu *cpu)
   7674 {
   7675   unsigned int rn = INSTR (9, 5);
   7676   unsigned int st = INSTR (4, 0);
   7677 
   7678   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7679   aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
   7680 }
   7681 
   7682 /* 32 bit GReg to Vec move register.  */
   7683 static void
   7684 fgmovs (sim_cpu *cpu)
   7685 {
   7686   unsigned int rn = INSTR (9, 5);
   7687   unsigned int st = INSTR (4, 0);
   7688 
   7689   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7690   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
   7691 }
   7692 
   7693 /* 64 bit g to fp move register.  */
   7694 static void
   7695 fgmovd (sim_cpu *cpu)
   7696 {
   7697   unsigned int rn = INSTR (9, 5);
   7698   unsigned int st = INSTR (4, 0);
   7699 
   7700   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7701   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
   7702 }
   7703 
   7704 /* 32 bit fp to g move register.  */
   7705 static void
   7706 gfmovs (sim_cpu *cpu)
   7707 {
   7708   unsigned int rn = INSTR (9, 5);
   7709   unsigned int st = INSTR (4, 0);
   7710 
   7711   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7712   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
   7713 }
   7714 
   7715 /* 64 bit fp to g move register.  */
   7716 static void
   7717 gfmovd (sim_cpu *cpu)
   7718 {
   7719   unsigned int rn = INSTR (9, 5);
   7720   unsigned int st = INSTR (4, 0);
   7721 
   7722   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7723   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
   7724 }
   7725 
   7726 /* FP move immediate
   7727 
   7728    These install an immediate 8 bit value in the target register
   7729    where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
   7730    bit exponent.  */
   7731 
   7732 static void
   7733 fmovs (sim_cpu *cpu)
   7734 {
   7735   unsigned int sd = INSTR (4, 0);
   7736   uint32_t imm = INSTR (20, 13);
   7737   float f = fp_immediate_for_encoding_32 (imm);
   7738 
   7739   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7740   aarch64_set_FP_float (cpu, sd, f);
   7741 }
   7742 
   7743 static void
   7744 fmovd (sim_cpu *cpu)
   7745 {
   7746   unsigned int sd = INSTR (4, 0);
   7747   uint32_t imm = INSTR (20, 13);
   7748   double d = fp_immediate_for_encoding_64 (imm);
   7749 
   7750   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7751   aarch64_set_FP_double (cpu, sd, d);
   7752 }
   7753 
   7754 static void
   7755 dexSimpleFPImmediate (sim_cpu *cpu)
   7756 {
   7757   /* instr[31,23] == 00111100
   7758      instr[22]    == type : single(0)/double(1)
   7759      instr[21]    == 1
   7760      instr[20,13] == imm8
   7761      instr[12,10] == 100
   7762      instr[9,5]   == imm5 : 00000 ==> PK, ow ==> UNALLOC
   7763      instr[4,0]   == Rd  */
   7764   uint32_t imm5 = INSTR (9, 5);
   7765 
   7766   NYI_assert (31, 23, 0x3C);
   7767 
   7768   if (imm5 != 0)
   7769     HALT_UNALLOC;
   7770 
   7771   if (INSTR (22, 22))
   7772     fmovd (cpu);
   7773   else
   7774     fmovs (cpu);
   7775 }
   7776 
   7777 /* TODO specific decode and execute for group Load Store.  */
   7778 
   7779 /* TODO FP load/store single register (unscaled offset).  */
   7780 
   7781 /* TODO load 8 bit unscaled signed 9 bit.  */
   7782 /* TODO load 16 bit unscaled signed 9 bit.  */
   7783 
   7784 /* Load 32 bit unscaled signed 9 bit.  */
   7785 static void
   7786 fldurs (sim_cpu *cpu, int32_t offset)
   7787 {
   7788   unsigned int rn = INSTR (9, 5);
   7789   unsigned int st = INSTR (4, 0);
   7790 
   7791   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7792   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
   7793 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
   7794 }
   7795 
   7796 /* Load 64 bit unscaled signed 9 bit.  */
   7797 static void
   7798 fldurd (sim_cpu *cpu, int32_t offset)
   7799 {
   7800   unsigned int rn = INSTR (9, 5);
   7801   unsigned int st = INSTR (4, 0);
   7802 
   7803   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7804   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
   7805 		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
   7806 }
   7807 
   7808 /* Load 128 bit unscaled signed 9 bit.  */
   7809 static void
   7810 fldurq (sim_cpu *cpu, int32_t offset)
   7811 {
   7812   unsigned int rn = INSTR (9, 5);
   7813   unsigned int st = INSTR (4, 0);
   7814   FRegister a;
   7815   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
   7816 
   7817   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7818   aarch64_get_mem_long_double (cpu, addr, & a);
   7819   aarch64_set_FP_long_double (cpu, st, a);
   7820 }
   7821 
   7822 /* TODO store 8 bit unscaled signed 9 bit.  */
   7823 /* TODO store 16 bit unscaled signed 9 bit.  */
   7824 
   7825 
   7826 /* 1 source.  */
   7827 
   7828 /* Float absolute value.  */
   7829 static void
   7830 fabss (sim_cpu *cpu)
   7831 {
   7832   unsigned sn = INSTR (9, 5);
   7833   unsigned sd = INSTR (4, 0);
   7834   float value = aarch64_get_FP_float (cpu, sn);
   7835 
   7836   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7837   aarch64_set_FP_float (cpu, sd, fabsf (value));
   7838 }
   7839 
   7840 /* Double absolute value.  */
   7841 static void
   7842 fabcpu (sim_cpu *cpu)
   7843 {
   7844   unsigned sn = INSTR (9, 5);
   7845   unsigned sd = INSTR (4, 0);
   7846   double value = aarch64_get_FP_double (cpu, sn);
   7847 
   7848   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7849   aarch64_set_FP_double (cpu, sd, fabs (value));
   7850 }
   7851 
   7852 /* Float negative value.  */
   7853 static void
   7854 fnegs (sim_cpu *cpu)
   7855 {
   7856   unsigned sn = INSTR (9, 5);
   7857   unsigned sd = INSTR (4, 0);
   7858 
   7859   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7860   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
   7861 }
   7862 
   7863 /* Double negative value.  */
   7864 static void
   7865 fnegd (sim_cpu *cpu)
   7866 {
   7867   unsigned sn = INSTR (9, 5);
   7868   unsigned sd = INSTR (4, 0);
   7869 
   7870   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7871   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
   7872 }
   7873 
   7874 /* Float square root.  */
   7875 static void
   7876 fsqrts (sim_cpu *cpu)
   7877 {
   7878   unsigned sn = INSTR (9, 5);
   7879   unsigned sd = INSTR (4, 0);
   7880 
   7881   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7882   aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
   7883 }
   7884 
   7885 /* Double square root.  */
   7886 static void
   7887 fsqrtd (sim_cpu *cpu)
   7888 {
   7889   unsigned sn = INSTR (9, 5);
   7890   unsigned sd = INSTR (4, 0);
   7891 
   7892   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7893   aarch64_set_FP_double (cpu, sd,
   7894 			 sqrt (aarch64_get_FP_double (cpu, sn)));
   7895 }
   7896 
   7897 /* Convert double to float.  */
   7898 static void
   7899 fcvtds (sim_cpu *cpu)
   7900 {
   7901   unsigned sn = INSTR (9, 5);
   7902   unsigned sd = INSTR (4, 0);
   7903 
   7904   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7905   aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
   7906 }
   7907 
   7908 /* Convert float to double.  */
   7909 static void
   7910 fcvtcpu (sim_cpu *cpu)
   7911 {
   7912   unsigned sn = INSTR (9, 5);
   7913   unsigned sd = INSTR (4, 0);
   7914 
   7915   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7916   aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
   7917 }
   7918 
   7919 static void
   7920 do_FRINT (sim_cpu *cpu)
   7921 {
   7922   /* instr[31,23] = 0001 1110 0
   7923      instr[22]    = single(0)/double(1)
   7924      instr[21,18] = 1001
   7925      instr[17,15] = rounding mode
   7926      instr[14,10] = 10000
   7927      instr[9,5]   = source
   7928      instr[4,0]   = dest  */
   7929 
   7930   float val;
   7931   unsigned rs = INSTR (9, 5);
   7932   unsigned rd = INSTR (4, 0);
   7933   unsigned int rmode = INSTR (17, 15);
   7934 
   7935   NYI_assert (31, 23, 0x03C);
   7936   NYI_assert (21, 18, 0x9);
   7937   NYI_assert (14, 10, 0x10);
   7938 
   7939   if (rmode == 6 || rmode == 7)
   7940     /* FIXME: Add support for rmode == 6 exactness check.  */
   7941     rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
   7942 
   7943   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   7944   if (INSTR (22, 22))
   7945     {
   7946       double val = aarch64_get_FP_double (cpu, rs);
   7947 
   7948       switch (rmode)
   7949 	{
   7950 	case 0: /* mode N: nearest or even.  */
   7951 	  {
   7952 	    double rval = round (val);
   7953 
   7954 	    if (val - rval == 0.5)
   7955 	      {
   7956 		if (((rval / 2.0) * 2.0) != rval)
   7957 		  rval += 1.0;
   7958 	      }
   7959 
   7960 	    aarch64_set_FP_double (cpu, rd, round (val));
   7961 	    return;
   7962 	  }
   7963 
   7964 	case 1: /* mode P: towards +inf.  */
   7965 	  if (val < 0.0)
   7966 	    aarch64_set_FP_double (cpu, rd, trunc (val));
   7967 	  else
   7968 	    aarch64_set_FP_double (cpu, rd, round (val));
   7969 	  return;
   7970 
   7971 	case 2: /* mode M: towards -inf.  */
   7972 	  if (val < 0.0)
   7973 	    aarch64_set_FP_double (cpu, rd, round (val));
   7974 	  else
   7975 	    aarch64_set_FP_double (cpu, rd, trunc (val));
   7976 	  return;
   7977 
   7978 	case 3: /* mode Z: towards 0.  */
   7979 	  aarch64_set_FP_double (cpu, rd, trunc (val));
   7980 	  return;
   7981 
   7982 	case 4: /* mode A: away from 0.  */
   7983 	  aarch64_set_FP_double (cpu, rd, round (val));
   7984 	  return;
   7985 
   7986 	case 6: /* mode X: use FPCR with exactness check.  */
   7987 	case 7: /* mode I: use FPCR mode.  */
   7988 	  HALT_NYI;
   7989 
   7990 	default:
   7991 	  HALT_UNALLOC;
   7992 	}
   7993     }
   7994 
   7995   val = aarch64_get_FP_float (cpu, rs);
   7996 
   7997   switch (rmode)
   7998     {
   7999     case 0: /* mode N: nearest or even.  */
   8000       {
   8001 	float rval = roundf (val);
   8002 
   8003 	if (val - rval == 0.5)
   8004 	  {
   8005 	    if (((rval / 2.0) * 2.0) != rval)
   8006 	      rval += 1.0;
   8007 	  }
   8008 
   8009 	aarch64_set_FP_float (cpu, rd, rval);
   8010 	return;
   8011       }
   8012 
   8013     case 1: /* mode P: towards +inf.  */
   8014       if (val < 0.0)
   8015 	aarch64_set_FP_float (cpu, rd, truncf (val));
   8016       else
   8017 	aarch64_set_FP_float (cpu, rd, roundf (val));
   8018       return;
   8019 
   8020     case 2: /* mode M: towards -inf.  */
   8021       if (val < 0.0)
   8022 	aarch64_set_FP_float (cpu, rd, truncf (val));
   8023       else
   8024 	aarch64_set_FP_float (cpu, rd, roundf (val));
   8025       return;
   8026 
   8027     case 3: /* mode Z: towards 0.  */
   8028       aarch64_set_FP_float (cpu, rd, truncf (val));
   8029       return;
   8030 
   8031     case 4: /* mode A: away from 0.  */
   8032       aarch64_set_FP_float (cpu, rd, roundf (val));
   8033       return;
   8034 
   8035     case 6: /* mode X: use FPCR with exactness check.  */
   8036     case 7: /* mode I: use FPCR mode.  */
   8037       HALT_NYI;
   8038 
   8039     default:
   8040       HALT_UNALLOC;
   8041     }
   8042 }
   8043 
   8044 /* Convert half to float.  */
   8045 static void
   8046 do_FCVT_half_to_single (sim_cpu *cpu)
   8047 {
   8048   unsigned rn = INSTR (9, 5);
   8049   unsigned rd = INSTR (4, 0);
   8050 
   8051   NYI_assert (31, 10, 0x7B890);
   8052 
   8053   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8054   aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half  (cpu, rn));
   8055 }
   8056 
   8057 /* Convert half to double.  */
   8058 static void
   8059 do_FCVT_half_to_double (sim_cpu *cpu)
   8060 {
   8061   unsigned rn = INSTR (9, 5);
   8062   unsigned rd = INSTR (4, 0);
   8063 
   8064   NYI_assert (31, 10, 0x7B8B0);
   8065 
   8066   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8067   aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half  (cpu, rn));
   8068 }
   8069 
   8070 static void
   8071 do_FCVT_single_to_half (sim_cpu *cpu)
   8072 {
   8073   unsigned rn = INSTR (9, 5);
   8074   unsigned rd = INSTR (4, 0);
   8075 
   8076   NYI_assert (31, 10, 0x788F0);
   8077 
   8078   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8079   aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float  (cpu, rn));
   8080 }
   8081 
   8082 /* Convert double to half.  */
   8083 static void
   8084 do_FCVT_double_to_half (sim_cpu *cpu)
   8085 {
   8086   unsigned rn = INSTR (9, 5);
   8087   unsigned rd = INSTR (4, 0);
   8088 
   8089   NYI_assert (31, 10, 0x798F0);
   8090 
   8091   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8092   aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double  (cpu, rn));
   8093 }
   8094 
   8095 static void
   8096 dexSimpleFPDataProc1Source (sim_cpu *cpu)
   8097 {
   8098   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
   8099      instr[30]    = 0
   8100      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
   8101      instr[28,25] = 1111
   8102      instr[24]    = 0
   8103      instr[23,22] ==> type : 00 ==> source is single,
   8104                              01 ==> source is double
   8105                              10 ==> UNALLOC
   8106                              11 ==> UNALLOC or source is half
   8107      instr[21]    = 1
   8108      instr[20,15] ==> opcode : with type 00 or 01
   8109                                000000 ==> FMOV, 000001 ==> FABS,
   8110                                000010 ==> FNEG, 000011 ==> FSQRT,
   8111                                000100 ==> UNALLOC, 000101 ==> FCVT,(to single/double)
   8112                                000110 ==> UNALLOC, 000111 ==> FCVT (to half)
   8113                                001000 ==> FRINTN, 001001 ==> FRINTP,
   8114                                001010 ==> FRINTM, 001011 ==> FRINTZ,
   8115                                001100 ==> FRINTA, 001101 ==> UNALLOC
   8116                                001110 ==> FRINTX, 001111 ==> FRINTI
   8117                                with type 11
   8118                                000100 ==> FCVT (half-to-single)
   8119                                000101 ==> FCVT (half-to-double)
   8120 			       instr[14,10] = 10000.  */
   8121 
   8122   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
   8123   uint32_t type   = INSTR (23, 22);
   8124   uint32_t opcode = INSTR (20, 15);
   8125 
   8126   if (M_S != 0)
   8127     HALT_UNALLOC;
   8128 
   8129   if (type == 3)
   8130     {
   8131       if (opcode == 4)
   8132 	do_FCVT_half_to_single (cpu);
   8133       else if (opcode == 5)
   8134 	do_FCVT_half_to_double (cpu);
   8135       else
   8136 	HALT_UNALLOC;
   8137       return;
   8138     }
   8139 
   8140   if (type == 2)
   8141     HALT_UNALLOC;
   8142 
   8143   switch (opcode)
   8144     {
   8145     case 0:
   8146       if (type)
   8147 	ffmovd (cpu);
   8148       else
   8149 	ffmovs (cpu);
   8150       return;
   8151 
   8152     case 1:
   8153       if (type)
   8154 	fabcpu (cpu);
   8155       else
   8156 	fabss (cpu);
   8157       return;
   8158 
   8159     case 2:
   8160       if (type)
   8161 	fnegd (cpu);
   8162       else
   8163 	fnegs (cpu);
   8164       return;
   8165 
   8166     case 3:
   8167       if (type)
   8168 	fsqrtd (cpu);
   8169       else
   8170 	fsqrts (cpu);
   8171       return;
   8172 
   8173     case 4:
   8174       if (type)
   8175 	fcvtds (cpu);
   8176       else
   8177 	HALT_UNALLOC;
   8178       return;
   8179 
   8180     case 5:
   8181       if (type)
   8182 	HALT_UNALLOC;
   8183       fcvtcpu (cpu);
   8184       return;
   8185 
   8186     case 8:		/* FRINTN etc.  */
   8187     case 9:
   8188     case 10:
   8189     case 11:
   8190     case 12:
   8191     case 14:
   8192     case 15:
   8193        do_FRINT (cpu);
   8194        return;
   8195 
   8196     case 7:
   8197       if (INSTR (22, 22))
   8198 	do_FCVT_double_to_half (cpu);
   8199       else
   8200 	do_FCVT_single_to_half (cpu);
   8201       return;
   8202 
   8203     case 13:
   8204       HALT_NYI;
   8205 
   8206     default:
   8207       HALT_UNALLOC;
   8208     }
   8209 }
   8210 
   8211 /* 32 bit signed int to float.  */
   8212 static void
   8213 scvtf32 (sim_cpu *cpu)
   8214 {
   8215   unsigned rn = INSTR (9, 5);
   8216   unsigned sd = INSTR (4, 0);
   8217 
   8218   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8219   aarch64_set_FP_float
   8220     (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
   8221 }
   8222 
   8223 /* signed int to float.  */
   8224 static void
   8225 scvtf (sim_cpu *cpu)
   8226 {
   8227   unsigned rn = INSTR (9, 5);
   8228   unsigned sd = INSTR (4, 0);
   8229 
   8230   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8231   aarch64_set_FP_float
   8232     (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
   8233 }
   8234 
   8235 /* 32 bit signed int to double.  */
   8236 static void
   8237 scvtd32 (sim_cpu *cpu)
   8238 {
   8239   unsigned rn = INSTR (9, 5);
   8240   unsigned sd = INSTR (4, 0);
   8241 
   8242   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8243   aarch64_set_FP_double
   8244     (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
   8245 }
   8246 
   8247 /* signed int to double.  */
   8248 static void
   8249 scvtd (sim_cpu *cpu)
   8250 {
   8251   unsigned rn = INSTR (9, 5);
   8252   unsigned sd = INSTR (4, 0);
   8253 
   8254   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8255   aarch64_set_FP_double
   8256     (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
   8257 }
   8258 
/* Integer range limits expressed in the floating point type being
   converted.  The names follow the pattern FTYPE_ITYPE_{MAX,MIN} so
   that RAISE_EXCEPTIONS can build them by token pasting.
   NOTE(review): the LONG/ULONG limits describe the host's long type,
   yet they are used when converting to 64 bit guest registers --
   confirm for hosts where long is narrower than 64 bits.  */

/* <limits.h> has no minimum for the unsigned types; supply one.  */
#define UINT_MIN 0
#define ULONG_MIN 0

/* Limits as seen by single precision conversions.  */
static const float FLOAT_INT_MAX = (float) INT_MAX;
static const float FLOAT_INT_MIN = (float) INT_MIN;
static const float FLOAT_LONG_MAX = (float) LONG_MAX;
static const float FLOAT_LONG_MIN = (float) LONG_MIN;
static const float FLOAT_UINT_MAX = (float) UINT_MAX;
static const float FLOAT_UINT_MIN = (float) UINT_MIN;
static const float FLOAT_ULONG_MAX = (float) ULONG_MAX;
static const float FLOAT_ULONG_MIN = (float) ULONG_MIN;

/* Limits as seen by double precision conversions.  */
static const double DOUBLE_INT_MAX = (double) INT_MAX;
static const double DOUBLE_INT_MIN = (double) INT_MIN;
static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
static const double DOUBLE_UINT_MAX = (double) UINT_MAX;
static const double DOUBLE_UINT_MIN = (double) UINT_MIN;
static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
   8278 
/* Post-process a float-to-integer conversion of F whose C-cast result
   is already in VALUE.  FTYPE (FLOAT/DOUBLE) and ITYPE (INT/LONG/UINT/
   ULONG) select the FTYPE_ITYPE_{MAX,MIN} limits and the ITYPE_{MAX,MIN}
   saturation values by token pasting.  Expects a variable named cpu in
   scope.  F is evaluated more than once, so it must have no side
   effects; VALUE must be an lvalue.

   Check for FP exception conditions:
     NaN raises IO and yields zero
     Infinity raises IO and saturates according to its sign
     Out of Range raises IO and IX and saturates value
     Denormal raises ID and IX and sets to zero.
   An in-range normal value leaves VALUE untouched.  FPSR flags are
   only ever updated through the masked setter so that flags unrelated
   to this conversion are preserved.  */
#define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)                \
  do                                                            \
    {                                                           \
      switch (fpclassify (F))                                   \
        {                                                       \
        case FP_NAN:                                            \
          aarch64_set_FPSR_bits (cpu, IO, IO);                  \
          VALUE = 0;                                            \
          break;                                                \
                                                                \
        case FP_INFINITE:                                       \
          aarch64_set_FPSR_bits (cpu, IO, IO);                  \
          if (signbit (F))                                      \
            VALUE = ITYPE##_MIN;                                \
          else                                                  \
            VALUE = ITYPE##_MAX;                                \
          break;                                                \
                                                                \
        case FP_NORMAL:                                         \
          if ((F) >= FTYPE##_##ITYPE##_MAX)                     \
            {                                                   \
              aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);    \
              VALUE = ITYPE##_MAX;                              \
            }                                                   \
          else if ((F) <= FTYPE##_##ITYPE##_MIN)                \
            {                                                   \
              aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);    \
              VALUE = ITYPE##_MIN;                              \
            }                                                   \
          break;                                                \
                                                                \
        case FP_SUBNORMAL:                                      \
          aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);   \
          VALUE = 0;                                            \
          break;                                                \
                                                                \
        default:                                                \
        case FP_ZERO:                                           \
          VALUE = 0;                                            \
          break;                                                \
        }                                                       \
    }                                                           \
  while (0)
   8323 
   8324 /* 32 bit convert float to signed int truncate towards zero.  */
   8325 static void
   8326 fcvtszs32 (sim_cpu *cpu)
   8327 {
   8328   unsigned sn = INSTR (9, 5);
   8329   unsigned rd = INSTR (4, 0);
   8330   /* TODO : check that this rounds toward zero.  */
   8331   float   f = aarch64_get_FP_float (cpu, sn);
   8332   int32_t value = (int32_t) f;
   8333 
   8334   RAISE_EXCEPTIONS (f, value, FLOAT, INT);
   8335 
   8336   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8337   /* Avoid sign extension to 64 bit.  */
   8338   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
   8339 }
   8340 
   8341 /* 64 bit convert float to signed int truncate towards zero.  */
   8342 static void
   8343 fcvtszs (sim_cpu *cpu)
   8344 {
   8345   unsigned sn = INSTR (9, 5);
   8346   unsigned rd = INSTR (4, 0);
   8347   float f = aarch64_get_FP_float (cpu, sn);
   8348   int64_t value = (int64_t) f;
   8349 
   8350   RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
   8351 
   8352   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8353   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
   8354 }
   8355 
   8356 /* 32 bit convert double to signed int truncate towards zero.  */
   8357 static void
   8358 fcvtszd32 (sim_cpu *cpu)
   8359 {
   8360   unsigned sn = INSTR (9, 5);
   8361   unsigned rd = INSTR (4, 0);
   8362   /* TODO : check that this rounds toward zero.  */
   8363   double   d = aarch64_get_FP_double (cpu, sn);
   8364   int32_t  value = (int32_t) d;
   8365 
   8366   RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
   8367 
   8368   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8369   /* Avoid sign extension to 64 bit.  */
   8370   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
   8371 }
   8372 
   8373 /* 64 bit convert double to signed int truncate towards zero.  */
   8374 static void
   8375 fcvtszd (sim_cpu *cpu)
   8376 {
   8377   unsigned sn = INSTR (9, 5);
   8378   unsigned rd = INSTR (4, 0);
   8379   /* TODO : check that this rounds toward zero.  */
   8380   double  d = aarch64_get_FP_double (cpu, sn);
   8381   int64_t value;
   8382 
   8383   value = (int64_t) d;
   8384 
   8385   RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
   8386 
   8387   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8388   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
   8389 }
   8390 
   8391 static void
   8392 do_fcvtzu (sim_cpu *cpu)
   8393 {
   8394   /* instr[31]    = size: 32-bit (0), 64-bit (1)
   8395      instr[30,23] = 00111100
   8396      instr[22]    = type: single (0)/ double (1)
   8397      instr[21]    = enable (0)/disable(1) precision
   8398      instr[20,16] = 11001
   8399      instr[15,10] = precision
   8400      instr[9,5]   = Rs
   8401      instr[4,0]   = Rd.  */
   8402 
   8403   unsigned rs = INSTR (9, 5);
   8404   unsigned rd = INSTR (4, 0);
   8405 
   8406   NYI_assert (30, 23, 0x3C);
   8407   NYI_assert (20, 16, 0x19);
   8408 
   8409   if (INSTR (21, 21) != 1)
   8410     /* Convert to fixed point.  */
   8411     HALT_NYI;
   8412 
   8413   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8414   if (INSTR (31, 31))
   8415     {
   8416       /* Convert to unsigned 64-bit integer.  */
   8417       if (INSTR (22, 22))
   8418 	{
   8419 	  double  d = aarch64_get_FP_double (cpu, rs);
   8420 	  uint64_t value = (uint64_t) d;
   8421 
   8422 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
   8423 	  if (value != (1UL << 63))
   8424 	    RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
   8425 
   8426 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
   8427 	}
   8428       else
   8429 	{
   8430 	  float  f = aarch64_get_FP_float (cpu, rs);
   8431 	  uint64_t value = (uint64_t) f;
   8432 
   8433 	  /* Do not raise an exception if we have reached ULONG_MAX.  */
   8434 	  if (value != (1UL << 63))
   8435 	    RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
   8436 
   8437 	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
   8438 	}
   8439     }
   8440   else
   8441     {
   8442       uint32_t value;
   8443 
   8444       /* Convert to unsigned 32-bit integer.  */
   8445       if (INSTR (22, 22))
   8446 	{
   8447 	  double  d = aarch64_get_FP_double (cpu, rs);
   8448 
   8449 	  value = (uint32_t) d;
   8450 	  /* Do not raise an exception if we have reached UINT_MAX.  */
   8451 	  if (value != (1UL << 31))
   8452 	    RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
   8453 	}
   8454       else
   8455 	{
   8456 	  float  f = aarch64_get_FP_float (cpu, rs);
   8457 
   8458 	  value = (uint32_t) f;
   8459 	  /* Do not raise an exception if we have reached UINT_MAX.  */
   8460 	  if (value != (1UL << 31))
   8461 	    RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
   8462 	}
   8463 
   8464       aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
   8465     }
   8466 }
   8467 
   8468 static void
   8469 do_UCVTF (sim_cpu *cpu)
   8470 {
   8471   /* instr[31]    = size: 32-bit (0), 64-bit (1)
   8472      instr[30,23] = 001 1110 0
   8473      instr[22]    = type: single (0)/ double (1)
   8474      instr[21]    = enable (0)/disable(1) precision
   8475      instr[20,16] = 0 0011
   8476      instr[15,10] = precision
   8477      instr[9,5]   = Rs
   8478      instr[4,0]   = Rd.  */
   8479 
   8480   unsigned rs = INSTR (9, 5);
   8481   unsigned rd = INSTR (4, 0);
   8482 
   8483   NYI_assert (30, 23, 0x3C);
   8484   NYI_assert (20, 16, 0x03);
   8485 
   8486   if (INSTR (21, 21) != 1)
   8487     HALT_NYI;
   8488 
   8489   /* FIXME: Add exception raising.  */
   8490   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8491   if (INSTR (31, 31))
   8492     {
   8493       uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
   8494 
   8495       if (INSTR (22, 22))
   8496 	aarch64_set_FP_double (cpu, rd, (double) value);
   8497       else
   8498 	aarch64_set_FP_float (cpu, rd, (float) value);
   8499     }
   8500   else
   8501     {
   8502       uint32_t value =  aarch64_get_reg_u32 (cpu, rs, NO_SP);
   8503 
   8504       if (INSTR (22, 22))
   8505 	aarch64_set_FP_double (cpu, rd, (double) value);
   8506       else
   8507 	aarch64_set_FP_float (cpu, rd, (float) value);
   8508     }
   8509 }
   8510 
   8511 static void
   8512 float_vector_move (sim_cpu *cpu)
   8513 {
   8514   /* instr[31,17] == 100 1111 0101 0111
   8515      instr[16]    ==> direction 0=> to GR, 1=> from GR
   8516      instr[15,10] => ???
   8517      instr[9,5]   ==> source
   8518      instr[4,0]   ==> dest.  */
   8519 
   8520   unsigned rn = INSTR (9, 5);
   8521   unsigned rd = INSTR (4, 0);
   8522 
   8523   NYI_assert (31, 17, 0x4F57);
   8524 
   8525   if (INSTR (15, 10) != 0)
   8526     HALT_UNALLOC;
   8527 
   8528   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8529   if (INSTR (16, 16))
   8530     aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
   8531   else
   8532     aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
   8533 }
   8534 
   8535 static void
   8536 dexSimpleFPIntegerConvert (sim_cpu *cpu)
   8537 {
   8538   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   8539      instr[30     = 0
   8540      instr[29]    = S :  0 ==> OK, 1 ==> UNALLOC
   8541      instr[28,25] = 1111
   8542      instr[24]    = 0
   8543      instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
   8544      instr[21]    = 1
   8545      instr[20,19] = rmode
   8546      instr[18,16] = opcode
   8547      instr[15,10] = 10 0000  */
   8548 
   8549   uint32_t rmode_opcode;
   8550   uint32_t size_type;
   8551   uint32_t type;
   8552   uint32_t size;
   8553   uint32_t S;
   8554 
   8555   if (INSTR (31, 17) == 0x4F57)
   8556     {
   8557       float_vector_move (cpu);
   8558       return;
   8559     }
   8560 
   8561   size = INSTR (31, 31);
   8562   S = INSTR (29, 29);
   8563   if (S != 0)
   8564     HALT_UNALLOC;
   8565 
   8566   type = INSTR (23, 22);
   8567   if (type > 1)
   8568     HALT_UNALLOC;
   8569 
   8570   rmode_opcode = INSTR (20, 16);
   8571   size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d.  */
   8572 
   8573   switch (rmode_opcode)
   8574     {
   8575     case 2:			/* SCVTF.  */
   8576       switch (size_type)
   8577 	{
   8578 	case 0: scvtf32 (cpu); return;
   8579 	case 1: scvtd32 (cpu); return;
   8580 	case 2: scvtf (cpu); return;
   8581 	case 3: scvtd (cpu); return;
   8582 	}
   8583 
   8584     case 6:			/* FMOV GR, Vec.  */
   8585       switch (size_type)
   8586 	{
   8587 	case 0:  gfmovs (cpu); return;
   8588 	case 3:  gfmovd (cpu); return;
   8589 	default: HALT_UNALLOC;
   8590 	}
   8591 
   8592     case 7:			/* FMOV vec, GR.  */
   8593       switch (size_type)
   8594 	{
   8595 	case 0:  fgmovs (cpu); return;
   8596 	case 3:  fgmovd (cpu); return;
   8597 	default: HALT_UNALLOC;
   8598 	}
   8599 
   8600     case 24:			/* FCVTZS.  */
   8601       switch (size_type)
   8602 	{
   8603 	case 0: fcvtszs32 (cpu); return;
   8604 	case 1: fcvtszd32 (cpu); return;
   8605 	case 2: fcvtszs (cpu); return;
   8606 	case 3: fcvtszd (cpu); return;
   8607 	}
   8608 
   8609     case 25: do_fcvtzu (cpu); return;
   8610     case 3:  do_UCVTF (cpu); return;
   8611 
   8612     case 0:	/* FCVTNS.  */
   8613     case 1:	/* FCVTNU.  */
   8614     case 4:	/* FCVTAS.  */
   8615     case 5:	/* FCVTAU.  */
   8616     case 8:	/* FCVPTS.  */
   8617     case 9:	/* FCVTPU.  */
   8618     case 16:	/* FCVTMS.  */
   8619     case 17:	/* FCVTMU.  */
   8620     default:
   8621       HALT_NYI;
   8622     }
   8623 }
   8624 
   8625 static void
   8626 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
   8627 {
   8628   uint32_t flags;
   8629 
   8630   /* FIXME: Add exception raising.  */
   8631   if (isnan (fvalue1) || isnan (fvalue2))
   8632     flags = C|V;
   8633   else if (isinf (fvalue1) && isinf (fvalue2))
   8634     {
   8635       /* Subtracting two infinities may give a NaN.  We only need to compare
   8636 	 the signs, which we can get from isinf.  */
   8637       int result = isinf (fvalue1) - isinf (fvalue2);
   8638 
   8639       if (result == 0)
   8640 	flags = Z|C;
   8641       else if (result < 0)
   8642 	flags = N;
   8643       else /* (result > 0).  */
   8644 	flags = C;
   8645     }
   8646   else
   8647     {
   8648       float result = fvalue1 - fvalue2;
   8649 
   8650       if (result == 0.0)
   8651 	flags = Z|C;
   8652       else if (result < 0)
   8653 	flags = N;
   8654       else /* (result > 0).  */
   8655 	flags = C;
   8656     }
   8657 
   8658   aarch64_set_CPSR (cpu, flags);
   8659 }
   8660 
   8661 static void
   8662 fcmps (sim_cpu *cpu)
   8663 {
   8664   unsigned sm = INSTR (20, 16);
   8665   unsigned sn = INSTR ( 9,  5);
   8666 
   8667   float fvalue1 = aarch64_get_FP_float (cpu, sn);
   8668   float fvalue2 = aarch64_get_FP_float (cpu, sm);
   8669 
   8670   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8671   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
   8672 }
   8673 
   8674 /* Float compare to zero -- Invalid Operation exception
   8675    only on signaling NaNs.  */
   8676 static void
   8677 fcmpzs (sim_cpu *cpu)
   8678 {
   8679   unsigned sn = INSTR ( 9,  5);
   8680   float fvalue1 = aarch64_get_FP_float (cpu, sn);
   8681 
   8682   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8683   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
   8684 }
   8685 
   8686 /* Float compare -- Invalid Operation exception on all NaNs.  */
   8687 static void
   8688 fcmpes (sim_cpu *cpu)
   8689 {
   8690   unsigned sm = INSTR (20, 16);
   8691   unsigned sn = INSTR ( 9,  5);
   8692 
   8693   float fvalue1 = aarch64_get_FP_float (cpu, sn);
   8694   float fvalue2 = aarch64_get_FP_float (cpu, sm);
   8695 
   8696   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8697   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
   8698 }
   8699 
   8700 /* Float compare to zero -- Invalid Operation exception on all NaNs.  */
   8701 static void
   8702 fcmpzes (sim_cpu *cpu)
   8703 {
   8704   unsigned sn = INSTR ( 9,  5);
   8705   float fvalue1 = aarch64_get_FP_float (cpu, sn);
   8706 
   8707   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8708   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
   8709 }
   8710 
   8711 static void
   8712 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
   8713 {
   8714   uint32_t flags;
   8715 
   8716   /* FIXME: Add exception raising.  */
   8717   if (isnan (dval1) || isnan (dval2))
   8718     flags = C|V;
   8719   else if (isinf (dval1) && isinf (dval2))
   8720     {
   8721       /* Subtracting two infinities may give a NaN.  We only need to compare
   8722 	 the signs, which we can get from isinf.  */
   8723       int result = isinf (dval1) - isinf (dval2);
   8724 
   8725       if (result == 0)
   8726 	flags = Z|C;
   8727       else if (result < 0)
   8728 	flags = N;
   8729       else /* (result > 0).  */
   8730 	flags = C;
   8731     }
   8732   else
   8733     {
   8734       double result = dval1 - dval2;
   8735 
   8736       if (result == 0.0)
   8737 	flags = Z|C;
   8738       else if (result < 0)
   8739 	flags = N;
   8740       else /* (result > 0).  */
   8741 	flags = C;
   8742     }
   8743 
   8744   aarch64_set_CPSR (cpu, flags);
   8745 }
   8746 
   8747 /* Double compare -- Invalid Operation exception only on signaling NaNs.  */
   8748 static void
   8749 fcmpd (sim_cpu *cpu)
   8750 {
   8751   unsigned sm = INSTR (20, 16);
   8752   unsigned sn = INSTR ( 9,  5);
   8753 
   8754   double dvalue1 = aarch64_get_FP_double (cpu, sn);
   8755   double dvalue2 = aarch64_get_FP_double (cpu, sm);
   8756 
   8757   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8758   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
   8759 }
   8760 
   8761 /* Double compare to zero -- Invalid Operation exception
   8762    only on signaling NaNs.  */
   8763 static void
   8764 fcmpzd (sim_cpu *cpu)
   8765 {
   8766   unsigned sn = INSTR ( 9,  5);
   8767   double dvalue1 = aarch64_get_FP_double (cpu, sn);
   8768 
   8769   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8770   set_flags_for_double_compare (cpu, dvalue1, 0.0);
   8771 }
   8772 
   8773 /* Double compare -- Invalid Operation exception on all NaNs.  */
   8774 static void
   8775 fcmped (sim_cpu *cpu)
   8776 {
   8777   unsigned sm = INSTR (20, 16);
   8778   unsigned sn = INSTR ( 9,  5);
   8779 
   8780   double dvalue1 = aarch64_get_FP_double (cpu, sn);
   8781   double dvalue2 = aarch64_get_FP_double (cpu, sm);
   8782 
   8783   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8784   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
   8785 }
   8786 
   8787 /* Double compare to zero -- Invalid Operation exception on all NaNs.  */
   8788 static void
   8789 fcmpzed (sim_cpu *cpu)
   8790 {
   8791   unsigned sn = INSTR ( 9,  5);
   8792   double dvalue1 = aarch64_get_FP_double (cpu, sn);
   8793 
   8794   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8795   set_flags_for_double_compare (cpu, dvalue1, 0.0);
   8796 }
   8797 
   8798 static void
   8799 dexSimpleFPCompare (sim_cpu *cpu)
   8800 {
   8801   /* assert instr[28,25] == 1111
   8802      instr[30:24:21:13,10] = 0011000
   8803      instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
   8804      instr[29] ==> S :  0 ==> OK, 1 ==> UNALLOC
   8805      instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
   8806      instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
   8807      instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
   8808                               01000 ==> FCMPZ, 11000 ==> FCMPEZ,
   8809                               ow ==> UNALLOC  */
   8810   uint32_t dispatch;
   8811   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
   8812   uint32_t type = INSTR (23, 22);
   8813   uint32_t op = INSTR (15, 14);
   8814   uint32_t op2_2_0 = INSTR (2, 0);
   8815 
   8816   if (op2_2_0 != 0)
   8817     HALT_UNALLOC;
   8818 
   8819   if (M_S != 0)
   8820     HALT_UNALLOC;
   8821 
   8822   if (type > 1)
   8823     HALT_UNALLOC;
   8824 
   8825   if (op != 0)
   8826     HALT_UNALLOC;
   8827 
   8828   /* dispatch on type and top 2 bits of opcode.  */
   8829   dispatch = (type << 2) | INSTR (4, 3);
   8830 
   8831   switch (dispatch)
   8832     {
   8833     case 0: fcmps (cpu); return;
   8834     case 1: fcmpzs (cpu); return;
   8835     case 2: fcmpes (cpu); return;
   8836     case 3: fcmpzes (cpu); return;
   8837     case 4: fcmpd (cpu); return;
   8838     case 5: fcmpzd (cpu); return;
   8839     case 6: fcmped (cpu); return;
   8840     case 7: fcmpzed (cpu); return;
   8841     }
   8842 }
   8843 
   8844 static void
   8845 do_scalar_FADDP (sim_cpu *cpu)
   8846 {
   8847   /* instr [31,23] = 0111 1110 0
   8848      instr [22]    = single(0)/double(1)
   8849      instr [21,10] = 11 0000 1101 10
   8850      instr [9,5]   = Fn
   8851      instr [4,0]   = Fd.  */
   8852 
   8853   unsigned Fn = INSTR (9, 5);
   8854   unsigned Fd = INSTR (4, 0);
   8855 
   8856   NYI_assert (31, 23, 0x0FC);
   8857   NYI_assert (21, 10, 0xC36);
   8858 
   8859   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8860   if (INSTR (22, 22))
   8861     {
   8862       double val1 = aarch64_get_vec_double (cpu, Fn, 0);
   8863       double val2 = aarch64_get_vec_double (cpu, Fn, 1);
   8864 
   8865       aarch64_set_FP_double (cpu, Fd, val1 + val2);
   8866     }
   8867   else
   8868     {
   8869       float val1 = aarch64_get_vec_float (cpu, Fn, 0);
   8870       float val2 = aarch64_get_vec_float (cpu, Fn, 1);
   8871 
   8872       aarch64_set_FP_float (cpu, Fd, val1 + val2);
   8873     }
   8874 }
   8875 
   8876 /* Floating point absolute difference.  */
   8877 
   8878 static void
   8879 do_scalar_FABD (sim_cpu *cpu)
   8880 {
   8881   /* instr [31,23] = 0111 1110 1
   8882      instr [22]    = float(0)/double(1)
   8883      instr [21]    = 1
   8884      instr [20,16] = Rm
   8885      instr [15,10] = 1101 01
   8886      instr [9, 5]  = Rn
   8887      instr [4, 0]  = Rd.  */
   8888 
   8889   unsigned rm = INSTR (20, 16);
   8890   unsigned rn = INSTR (9, 5);
   8891   unsigned rd = INSTR (4, 0);
   8892 
   8893   NYI_assert (31, 23, 0x0FD);
   8894   NYI_assert (21, 21, 1);
   8895   NYI_assert (15, 10, 0x35);
   8896 
   8897   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8898   if (INSTR (22, 22))
   8899     aarch64_set_FP_double (cpu, rd,
   8900 			   fabs (aarch64_get_FP_double (cpu, rn)
   8901 				 - aarch64_get_FP_double (cpu, rm)));
   8902   else
   8903     aarch64_set_FP_float (cpu, rd,
   8904 			  fabsf (aarch64_get_FP_float (cpu, rn)
   8905 				 - aarch64_get_FP_float (cpu, rm)));
   8906 }
   8907 
   8908 static void
   8909 do_scalar_CMGT (sim_cpu *cpu)
   8910 {
   8911   /* instr [31,21] = 0101 1110 111
   8912      instr [20,16] = Rm
   8913      instr [15,10] = 00 1101
   8914      instr [9, 5]  = Rn
   8915      instr [4, 0]  = Rd.  */
   8916 
   8917   unsigned rm = INSTR (20, 16);
   8918   unsigned rn = INSTR (9, 5);
   8919   unsigned rd = INSTR (4, 0);
   8920 
   8921   NYI_assert (31, 21, 0x2F7);
   8922   NYI_assert (15, 10, 0x0D);
   8923 
   8924   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8925   aarch64_set_vec_u64 (cpu, rd, 0,
   8926 		       aarch64_get_vec_u64 (cpu, rn, 0) >
   8927 		       aarch64_get_vec_u64 (cpu, rm, 0) ? -1L : 0L);
   8928 }
   8929 
   8930 static void
   8931 do_scalar_USHR (sim_cpu *cpu)
   8932 {
   8933   /* instr [31,23] = 0111 1111 0
   8934      instr [22,16] = shift amount
   8935      instr [15,10] = 0000 01
   8936      instr [9, 5]  = Rn
   8937      instr [4, 0]  = Rd.  */
   8938 
   8939   unsigned amount = 128 - INSTR (22, 16);
   8940   unsigned rn = INSTR (9, 5);
   8941   unsigned rd = INSTR (4, 0);
   8942 
   8943   NYI_assert (31, 23, 0x0FE);
   8944   NYI_assert (15, 10, 0x01);
   8945 
   8946   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8947   aarch64_set_vec_u64 (cpu, rd, 0,
   8948 		       aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
   8949 }
   8950 
   8951 static void
   8952 do_scalar_SSHL (sim_cpu *cpu)
   8953 {
   8954   /* instr [31,21] = 0101 1110 111
   8955      instr [20,16] = Rm
   8956      instr [15,10] = 0100 01
   8957      instr [9, 5]  = Rn
   8958      instr [4, 0]  = Rd.  */
   8959 
   8960   unsigned rm = INSTR (20, 16);
   8961   unsigned rn = INSTR (9, 5);
   8962   unsigned rd = INSTR (4, 0);
   8963   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
   8964 
   8965   NYI_assert (31, 21, 0x2F7);
   8966   NYI_assert (15, 10, 0x11);
   8967 
   8968   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8969   if (shift >= 0)
   8970     aarch64_set_vec_s64 (cpu, rd, 0,
   8971 			 aarch64_get_vec_s64 (cpu, rn, 0) << shift);
   8972   else
   8973     aarch64_set_vec_s64 (cpu, rd, 0,
   8974 			 aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
   8975 }
   8976 
   8977 /* Floating point scalar compare greater than or equal to 0.  */
   8978 static void
   8979 do_scalar_FCMGE_zero (sim_cpu *cpu)
   8980 {
   8981   /* instr [31,23] = 0111 1110 1
   8982      instr [22,22] = size
   8983      instr [21,16] = 1000 00
   8984      instr [15,10] = 1100 10
   8985      instr [9, 5]  = Rn
   8986      instr [4, 0]  = Rd.  */
   8987 
   8988   unsigned size = INSTR (22, 22);
   8989   unsigned rn = INSTR (9, 5);
   8990   unsigned rd = INSTR (4, 0);
   8991 
   8992   NYI_assert (31, 23, 0x0FD);
   8993   NYI_assert (21, 16, 0x20);
   8994   NYI_assert (15, 10, 0x32);
   8995 
   8996   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   8997   if (size)
   8998     aarch64_set_vec_u64 (cpu, rd, 0,
   8999 			 aarch64_get_vec_double (cpu, rn, 0) >= 0.0 ? -1 : 0);
   9000   else
   9001     aarch64_set_vec_u32 (cpu, rd, 0,
   9002 			 aarch64_get_vec_float (cpu, rn, 0) >= 0.0 ? -1 : 0);
   9003 }
   9004 
   9005 /* Floating point scalar compare less than or equal to 0.  */
   9006 static void
   9007 do_scalar_FCMLE_zero (sim_cpu *cpu)
   9008 {
   9009   /* instr [31,23] = 0111 1110 1
   9010      instr [22,22] = size
   9011      instr [21,16] = 1000 00
   9012      instr [15,10] = 1101 10
   9013      instr [9, 5]  = Rn
   9014      instr [4, 0]  = Rd.  */
   9015 
   9016   unsigned size = INSTR (22, 22);
   9017   unsigned rn = INSTR (9, 5);
   9018   unsigned rd = INSTR (4, 0);
   9019 
   9020   NYI_assert (31, 23, 0x0FD);
   9021   NYI_assert (21, 16, 0x20);
   9022   NYI_assert (15, 10, 0x36);
   9023 
   9024   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9025   if (size)
   9026     aarch64_set_vec_u64 (cpu, rd, 0,
   9027 			 aarch64_get_vec_double (cpu, rn, 0) <= 0.0 ? -1 : 0);
   9028   else
   9029     aarch64_set_vec_u32 (cpu, rd, 0,
   9030 			 aarch64_get_vec_float (cpu, rn, 0) <= 0.0 ? -1 : 0);
   9031 }
   9032 
   9033 /* Floating point scalar compare greater than 0.  */
   9034 static void
   9035 do_scalar_FCMGT_zero (sim_cpu *cpu)
   9036 {
   9037   /* instr [31,23] = 0101 1110 1
   9038      instr [22,22] = size
   9039      instr [21,16] = 1000 00
   9040      instr [15,10] = 1100 10
   9041      instr [9, 5]  = Rn
   9042      instr [4, 0]  = Rd.  */
   9043 
   9044   unsigned size = INSTR (22, 22);
   9045   unsigned rn = INSTR (9, 5);
   9046   unsigned rd = INSTR (4, 0);
   9047 
   9048   NYI_assert (31, 23, 0x0BD);
   9049   NYI_assert (21, 16, 0x20);
   9050   NYI_assert (15, 10, 0x32);
   9051 
   9052   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9053   if (size)
   9054     aarch64_set_vec_u64 (cpu, rd, 0,
   9055 			 aarch64_get_vec_double (cpu, rn, 0) > 0.0 ? -1 : 0);
   9056   else
   9057     aarch64_set_vec_u32 (cpu, rd, 0,
   9058 			 aarch64_get_vec_float (cpu, rn, 0) > 0.0 ? -1 : 0);
   9059 }
   9060 
   9061 /* Floating point scalar compare equal to 0.  */
   9062 static void
   9063 do_scalar_FCMEQ_zero (sim_cpu *cpu)
   9064 {
   9065   /* instr [31,23] = 0101 1110 1
   9066      instr [22,22] = size
   9067      instr [21,16] = 1000 00
   9068      instr [15,10] = 1101 10
   9069      instr [9, 5]  = Rn
   9070      instr [4, 0]  = Rd.  */
   9071 
   9072   unsigned size = INSTR (22, 22);
   9073   unsigned rn = INSTR (9, 5);
   9074   unsigned rd = INSTR (4, 0);
   9075 
   9076   NYI_assert (31, 23, 0x0BD);
   9077   NYI_assert (21, 16, 0x20);
   9078   NYI_assert (15, 10, 0x36);
   9079 
   9080   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9081   if (size)
   9082     aarch64_set_vec_u64 (cpu, rd, 0,
   9083 			 aarch64_get_vec_double (cpu, rn, 0) == 0.0 ? -1 : 0);
   9084   else
   9085     aarch64_set_vec_u32 (cpu, rd, 0,
   9086 			 aarch64_get_vec_float (cpu, rn, 0) == 0.0 ? -1 : 0);
   9087 }
   9088 
   9089 /* Floating point scalar compare less than 0.  */
   9090 static void
   9091 do_scalar_FCMLT_zero (sim_cpu *cpu)
   9092 {
   9093   /* instr [31,23] = 0101 1110 1
   9094      instr [22,22] = size
   9095      instr [21,16] = 1000 00
   9096      instr [15,10] = 1110 10
   9097      instr [9, 5]  = Rn
   9098      instr [4, 0]  = Rd.  */
   9099 
   9100   unsigned size = INSTR (22, 22);
   9101   unsigned rn = INSTR (9, 5);
   9102   unsigned rd = INSTR (4, 0);
   9103 
   9104   NYI_assert (31, 23, 0x0BD);
   9105   NYI_assert (21, 16, 0x20);
   9106   NYI_assert (15, 10, 0x3A);
   9107 
   9108   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9109   if (size)
   9110     aarch64_set_vec_u64 (cpu, rd, 0,
   9111 			 aarch64_get_vec_double (cpu, rn, 0) < 0.0 ? -1 : 0);
   9112   else
   9113     aarch64_set_vec_u32 (cpu, rd, 0,
   9114 			 aarch64_get_vec_float (cpu, rn, 0) < 0.0 ? -1 : 0);
   9115 }
   9116 
   9117 static void
   9118 do_scalar_shift (sim_cpu *cpu)
   9119 {
   9120   /* instr [31,23] = 0101 1111 0
   9121      instr [22,16] = shift amount
   9122      instr [15,10] = 0101 01   [SHL]
   9123      instr [15,10] = 0000 01   [SSHR]
   9124      instr [9, 5]  = Rn
   9125      instr [4, 0]  = Rd.  */
   9126 
   9127   unsigned rn = INSTR (9, 5);
   9128   unsigned rd = INSTR (4, 0);
   9129   unsigned amount;
   9130 
   9131   NYI_assert (31, 23, 0x0BE);
   9132 
   9133   if (INSTR (22, 22) == 0)
   9134     HALT_UNALLOC;
   9135 
   9136   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9137   switch (INSTR (15, 10))
   9138     {
   9139     case 0x01: /* SSHR */
   9140       amount = 128 - INSTR (22, 16);
   9141       aarch64_set_vec_s64 (cpu, rd, 0,
   9142 			   aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
   9143       return;
   9144     case 0x15: /* SHL */
   9145       amount = INSTR (22, 16) - 64;
   9146       aarch64_set_vec_u64 (cpu, rd, 0,
   9147 			   aarch64_get_vec_u64 (cpu, rn, 0) << amount);
   9148       return;
   9149     default:
   9150       HALT_NYI;
   9151     }
   9152 }
   9153 
   9154 /* FCMEQ FCMGT FCMGE.  */
   9155 static void
   9156 do_scalar_FCM (sim_cpu *cpu)
   9157 {
   9158   /* instr [31,30] = 01
   9159      instr [29]    = U
   9160      instr [28,24] = 1 1110
   9161      instr [23]    = E
   9162      instr [22]    = size
   9163      instr [21]    = 1
   9164      instr [20,16] = Rm
   9165      instr [15,12] = 1110
   9166      instr [11]    = AC
   9167      instr [10]    = 1
   9168      instr [9, 5]  = Rn
   9169      instr [4, 0]  = Rd.  */
   9170 
   9171   unsigned rm = INSTR (20, 16);
   9172   unsigned rn = INSTR (9, 5);
   9173   unsigned rd = INSTR (4, 0);
   9174   unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
   9175   unsigned result;
   9176   float val1;
   9177   float val2;
   9178 
   9179   NYI_assert (31, 30, 1);
   9180   NYI_assert (28, 24, 0x1E);
   9181   NYI_assert (21, 21, 1);
   9182   NYI_assert (15, 12, 0xE);
   9183   NYI_assert (10, 10, 1);
   9184 
   9185   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9186   if (INSTR (22, 22))
   9187     {
   9188       double val1 = aarch64_get_FP_double (cpu, rn);
   9189       double val2 = aarch64_get_FP_double (cpu, rm);
   9190 
   9191       switch (EUac)
   9192 	{
   9193 	case 0: /* 000 */
   9194 	  result = val1 == val2;
   9195 	  break;
   9196 
   9197 	case 3: /* 011 */
   9198 	  val1 = fabs (val1);
   9199 	  val2 = fabs (val2);
   9200 	  /* Fall through. */
   9201 	case 2: /* 010 */
   9202 	  result = val1 >= val2;
   9203 	  break;
   9204 
   9205 	case 7: /* 111 */
   9206 	  val1 = fabs (val1);
   9207 	  val2 = fabs (val2);
   9208 	  /* Fall through. */
   9209 	case 6: /* 110 */
   9210 	  result = val1 > val2;
   9211 	  break;
   9212 
   9213 	default:
   9214 	  HALT_UNALLOC;
   9215 	}
   9216 
   9217       aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
   9218       return;
   9219     }
   9220 
   9221   val1 = aarch64_get_FP_float (cpu, rn);
   9222   val2 = aarch64_get_FP_float (cpu, rm);
   9223 
   9224   switch (EUac)
   9225     {
   9226     case 0: /* 000 */
   9227       result = val1 == val2;
   9228       break;
   9229 
   9230     case 3: /* 011 */
   9231       val1 = fabsf (val1);
   9232       val2 = fabsf (val2);
   9233       /* Fall through. */
   9234     case 2: /* 010 */
   9235       result = val1 >= val2;
   9236       break;
   9237 
   9238     case 7: /* 111 */
   9239       val1 = fabsf (val1);
   9240       val2 = fabsf (val2);
   9241       /* Fall through. */
   9242     case 6: /* 110 */
   9243       result = val1 > val2;
   9244       break;
   9245 
   9246     default:
   9247       HALT_UNALLOC;
   9248     }
   9249 
   9250   aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
   9251 }
   9252 
   9253 /* An alias of DUP.  */
   9254 static void
   9255 do_scalar_MOV (sim_cpu *cpu)
   9256 {
   9257   /* instr [31,21] = 0101 1110 000
   9258      instr [20,16] = imm5
   9259      instr [15,10] = 0000 01
   9260      instr [9, 5]  = Rn
   9261      instr [4, 0]  = Rd.  */
   9262 
   9263   unsigned rn = INSTR (9, 5);
   9264   unsigned rd = INSTR (4, 0);
   9265   unsigned index;
   9266 
   9267   NYI_assert (31, 21, 0x2F0);
   9268   NYI_assert (15, 10, 0x01);
   9269 
   9270   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9271   if (INSTR (16, 16))
   9272     {
   9273       /* 8-bit.  */
   9274       index = INSTR (20, 17);
   9275       aarch64_set_vec_u8
   9276 	(cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
   9277     }
   9278   else if (INSTR (17, 17))
   9279     {
   9280       /* 16-bit.  */
   9281       index = INSTR (20, 18);
   9282       aarch64_set_vec_u16
   9283 	(cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
   9284     }
   9285   else if (INSTR (18, 18))
   9286     {
   9287       /* 32-bit.  */
   9288       index = INSTR (20, 19);
   9289       aarch64_set_vec_u32
   9290 	(cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
   9291     }
   9292   else if (INSTR (19, 19))
   9293     {
   9294       /* 64-bit.  */
   9295       index = INSTR (20, 20);
   9296       aarch64_set_vec_u64
   9297 	(cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
   9298     }
   9299   else
   9300     HALT_UNALLOC;
   9301 }
   9302 
   9303 static void
   9304 do_scalar_NEG (sim_cpu *cpu)
   9305 {
   9306   /* instr [31,10] = 0111 1110 1110 0000 1011 10
   9307      instr [9, 5]  = Rn
   9308      instr [4, 0]  = Rd.  */
   9309 
   9310   unsigned rn = INSTR (9, 5);
   9311   unsigned rd = INSTR (4, 0);
   9312 
   9313   NYI_assert (31, 10, 0x1FB82E);
   9314 
   9315   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9316   aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
   9317 }
   9318 
   9319 static void
   9320 do_scalar_USHL (sim_cpu *cpu)
   9321 {
   9322   /* instr [31,21] = 0111 1110 111
   9323      instr [20,16] = Rm
   9324      instr [15,10] = 0100 01
   9325      instr [9, 5]  = Rn
   9326      instr [4, 0]  = Rd.  */
   9327 
   9328   unsigned rm = INSTR (20, 16);
   9329   unsigned rn = INSTR (9, 5);
   9330   unsigned rd = INSTR (4, 0);
   9331   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
   9332 
   9333   NYI_assert (31, 21, 0x3F7);
   9334   NYI_assert (15, 10, 0x11);
   9335 
   9336   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9337   if (shift >= 0)
   9338     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
   9339   else
   9340     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
   9341 }
   9342 
   9343 static void
   9344 do_double_add (sim_cpu *cpu)
   9345 {
   9346   /* instr [31,21] = 0101 1110 111
   9347      instr [20,16] = Fn
   9348      instr [15,10] = 1000 01
   9349      instr [9,5]   = Fm
   9350      instr [4,0]   = Fd.  */
   9351   unsigned Fd;
   9352   unsigned Fm;
   9353   unsigned Fn;
   9354   double val1;
   9355   double val2;
   9356 
   9357   NYI_assert (31, 21, 0x2F7);
   9358   NYI_assert (15, 10, 0x21);
   9359 
   9360   Fd = INSTR (4, 0);
   9361   Fm = INSTR (9, 5);
   9362   Fn = INSTR (20, 16);
   9363 
   9364   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9365   val1 = aarch64_get_FP_double (cpu, Fm);
   9366   val2 = aarch64_get_FP_double (cpu, Fn);
   9367 
   9368   aarch64_set_FP_double (cpu, Fd, val1 + val2);
   9369 }
   9370 
   9371 static void
   9372 do_scalar_UCVTF (sim_cpu *cpu)
   9373 {
   9374   /* instr [31,23] = 0111 1110 0
   9375      instr [22]    = single(0)/double(1)
   9376      instr [21,10] = 10 0001 1101 10
   9377      instr [9,5]   = rn
   9378      instr [4,0]   = rd.  */
   9379 
   9380   unsigned rn = INSTR (9, 5);
   9381   unsigned rd = INSTR (4, 0);
   9382 
   9383   NYI_assert (31, 23, 0x0FC);
   9384   NYI_assert (21, 10, 0x876);
   9385 
   9386   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9387   if (INSTR (22, 22))
   9388     {
   9389       uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
   9390 
   9391       aarch64_set_vec_double (cpu, rd, 0, (double) val);
   9392     }
   9393   else
   9394     {
   9395       uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
   9396 
   9397       aarch64_set_vec_float (cpu, rd, 0, (float) val);
   9398     }
   9399 }
   9400 
   9401 static void
   9402 do_scalar_vec (sim_cpu *cpu)
   9403 {
   9404   /* instr [30] = 1.  */
   9405   /* instr [28,25] = 1111.  */
   9406   switch (INSTR (31, 23))
   9407     {
   9408     case 0xBC:
   9409       switch (INSTR (15, 10))
   9410 	{
   9411 	case 0x01: do_scalar_MOV (cpu); return;
   9412 	case 0x39: do_scalar_FCM (cpu); return;
   9413 	case 0x3B: do_scalar_FCM (cpu); return;
   9414 	}
   9415       break;
   9416 
   9417     case 0xBE: do_scalar_shift (cpu); return;
   9418 
   9419     case 0xFC:
   9420       switch (INSTR (15, 10))
   9421 	{
   9422 	case 0x36:
   9423 	  switch (INSTR (21, 16))
   9424 	    {
   9425 	    case 0x30: do_scalar_FADDP (cpu); return;
   9426 	    case 0x21: do_scalar_UCVTF (cpu); return;
   9427 	    }
   9428 	  HALT_NYI;
   9429 	case 0x39: do_scalar_FCM (cpu); return;
   9430 	case 0x3B: do_scalar_FCM (cpu); return;
   9431 	}
   9432       break;
   9433 
   9434     case 0xFD:
   9435       switch (INSTR (15, 10))
   9436 	{
   9437 	case 0x0D: do_scalar_CMGT (cpu); return;
   9438 	case 0x11: do_scalar_USHL (cpu); return;
   9439 	case 0x2E: do_scalar_NEG (cpu); return;
   9440 	case 0x32: do_scalar_FCMGE_zero (cpu); return;
   9441 	case 0x35: do_scalar_FABD (cpu); return;
   9442 	case 0x36: do_scalar_FCMLE_zero (cpu); return;
   9443 	case 0x39: do_scalar_FCM (cpu); return;
   9444 	case 0x3B: do_scalar_FCM (cpu); return;
   9445 	default:
   9446 	  HALT_NYI;
   9447 	}
   9448 
   9449     case 0xFE: do_scalar_USHR (cpu); return;
   9450 
   9451     case 0xBD:
   9452       switch (INSTR (15, 10))
   9453 	{
   9454 	case 0x21: do_double_add (cpu); return;
   9455 	case 0x11: do_scalar_SSHL (cpu); return;
   9456 	case 0x32: do_scalar_FCMGT_zero (cpu); return;
   9457 	case 0x36: do_scalar_FCMEQ_zero (cpu); return;
   9458 	case 0x3A: do_scalar_FCMLT_zero (cpu); return;
   9459 	default:
   9460 	  HALT_NYI;
   9461 	}
   9462 
   9463     default:
   9464       HALT_NYI;
   9465     }
   9466 }
   9467 
   9468 static void
   9469 dexAdvSIMD1 (sim_cpu *cpu)
   9470 {
   9471   /* instr [28,25] = 1 111.  */
   9472 
   9473   /* We are currently only interested in the basic
   9474      scalar fp routines which all have bit 30 = 0.  */
   9475   if (INSTR (30, 30))
   9476     do_scalar_vec (cpu);
   9477 
   9478   /* instr[24] is set for FP data processing 3-source and clear for
   9479      all other basic scalar fp instruction groups.  */
   9480   else if (INSTR (24, 24))
   9481     dexSimpleFPDataProc3Source (cpu);
   9482 
   9483   /* instr[21] is clear for floating <-> fixed conversions and set for
   9484      all other basic scalar fp instruction groups.  */
   9485   else if (!INSTR (21, 21))
   9486     dexSimpleFPFixedConvert (cpu);
   9487 
   9488   /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
   9489      11 ==> cond select,  00 ==> other.  */
   9490   else
   9491     switch (INSTR (11, 10))
   9492       {
   9493       case 1: dexSimpleFPCondCompare (cpu); return;
   9494       case 2: dexSimpleFPDataProc2Source (cpu); return;
   9495       case 3: dexSimpleFPCondSelect (cpu); return;
   9496 
   9497       default:
   9498 	/* Now an ordered cascade of tests.
   9499 	   FP immediate has instr [12] == 1.
   9500 	   FP compare has   instr [13] == 1.
   9501 	   FP Data Proc 1 Source has instr [14] == 1.
   9502 	   FP floating <--> integer conversions has instr [15] == 0.  */
   9503 	if (INSTR (12, 12))
   9504 	  dexSimpleFPImmediate (cpu);
   9505 
   9506 	else if (INSTR (13, 13))
   9507 	  dexSimpleFPCompare (cpu);
   9508 
   9509 	else if (INSTR (14, 14))
   9510 	  dexSimpleFPDataProc1Source (cpu);
   9511 
   9512 	else if (!INSTR (15, 15))
   9513 	  dexSimpleFPIntegerConvert (cpu);
   9514 
   9515 	else
   9516 	  /* If we get here then instr[15] == 1 which means UNALLOC.  */
   9517 	  HALT_UNALLOC;
   9518       }
   9519 }
   9520 
   9521 /* PC relative addressing.  */
   9522 
   9523 static void
   9524 pcadr (sim_cpu *cpu)
   9525 {
   9526   /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
   9527      instr[30,29] = immlo
   9528      instr[23,5] = immhi.  */
   9529   uint64_t address;
   9530   unsigned rd = INSTR (4, 0);
   9531   uint32_t isPage = INSTR (31, 31);
   9532   union { int64_t u64; uint64_t s64; } imm;
   9533   uint64_t offset;
   9534 
   9535   imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
   9536   offset = imm.u64;
   9537   offset = (offset << 2) | INSTR (30, 29);
   9538 
   9539   address = aarch64_get_PC (cpu);
   9540 
   9541   if (isPage)
   9542     {
   9543       offset <<= 12;
   9544       address &= ~0xfff;
   9545     }
   9546 
   9547   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9548   aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
   9549 }
   9550 
/* Specific decode and execute for group Data Processing Immediate.  */

/* Decode entry point for PC relative addressing.  No further decoding
   is done here: the instruction is handed straight to pcadr, which
   reads the operation and immediate fields itself.  */
static void
dexPCRelAddressing (sim_cpu *cpu)
{
  /* assert instr[28,24] = 10000.  */
  pcadr (cpu);
}
   9559 
/* Immediate logical.
   The bimm32/64 argument is constructed by replicating a 2, 4, 8,
   16, 32 or 64 bit sequence pulled out at decode and possibly
   inverting it.

   N.B. the output register (dest) can normally be Xn or SP;
   the exception occurs for flag setting instructions, which may
   only use Xn for the output (dest).  The input register can
   never be SP.  */
   9569 
   9570 /* 32 bit and immediate.  */
   9571 static void
   9572 and32 (sim_cpu *cpu, uint32_t bimm)
   9573 {
   9574   unsigned rn = INSTR (9, 5);
   9575   unsigned rd = INSTR (4, 0);
   9576 
   9577   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9578   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   9579 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
   9580 }
   9581 
   9582 /* 64 bit and immediate.  */
   9583 static void
   9584 and64 (sim_cpu *cpu, uint64_t bimm)
   9585 {
   9586   unsigned rn = INSTR (9, 5);
   9587   unsigned rd = INSTR (4, 0);
   9588 
   9589   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9590   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   9591 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
   9592 }
   9593 
   9594 /* 32 bit and immediate set flags.  */
   9595 static void
   9596 ands32 (sim_cpu *cpu, uint32_t bimm)
   9597 {
   9598   unsigned rn = INSTR (9, 5);
   9599   unsigned rd = INSTR (4, 0);
   9600 
   9601   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   9602   uint32_t value2 = bimm;
   9603 
   9604   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9605   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
   9606   set_flags_for_binop32 (cpu, value1 & value2);
   9607 }
   9608 
   9609 /* 64 bit and immediate set flags.  */
   9610 static void
   9611 ands64 (sim_cpu *cpu, uint64_t bimm)
   9612 {
   9613   unsigned rn = INSTR (9, 5);
   9614   unsigned rd = INSTR (4, 0);
   9615 
   9616   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   9617   uint64_t value2 = bimm;
   9618 
   9619   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9620   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
   9621   set_flags_for_binop64 (cpu, value1 & value2);
   9622 }
   9623 
   9624 /* 32 bit exclusive or immediate.  */
   9625 static void
   9626 eor32 (sim_cpu *cpu, uint32_t bimm)
   9627 {
   9628   unsigned rn = INSTR (9, 5);
   9629   unsigned rd = INSTR (4, 0);
   9630 
   9631   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9632   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   9633 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
   9634 }
   9635 
   9636 /* 64 bit exclusive or immediate.  */
   9637 static void
   9638 eor64 (sim_cpu *cpu, uint64_t bimm)
   9639 {
   9640   unsigned rn = INSTR (9, 5);
   9641   unsigned rd = INSTR (4, 0);
   9642 
   9643   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9644   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   9645 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
   9646 }
   9647 
   9648 /* 32 bit or immediate.  */
   9649 static void
   9650 orr32 (sim_cpu *cpu, uint32_t bimm)
   9651 {
   9652   unsigned rn = INSTR (9, 5);
   9653   unsigned rd = INSTR (4, 0);
   9654 
   9655   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9656   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   9657 		       aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
   9658 }
   9659 
   9660 /* 64 bit or immediate.  */
   9661 static void
   9662 orr64 (sim_cpu *cpu, uint64_t bimm)
   9663 {
   9664   unsigned rn = INSTR (9, 5);
   9665   unsigned rd = INSTR (4, 0);
   9666 
   9667   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9668   aarch64_set_reg_u64 (cpu, rd, SP_OK,
   9669 		       aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
   9670 }
   9671 
   9672 /* Logical shifted register.
   9673    These allow an optional LSL, ASR, LSR or ROR to the second source
   9674    register with a count up to the register bit count.
   9675    N.B register args may not be SP.  */
   9676 
   9677 /* 32 bit AND shifted register.  */
   9678 static void
   9679 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9680 {
   9681   unsigned rm = INSTR (20, 16);
   9682   unsigned rn = INSTR (9, 5);
   9683   unsigned rd = INSTR (4, 0);
   9684 
   9685   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9686   aarch64_set_reg_u64
   9687     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
   9688      & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
   9689 }
   9690 
   9691 /* 64 bit AND shifted register.  */
   9692 static void
   9693 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9694 {
   9695   unsigned rm = INSTR (20, 16);
   9696   unsigned rn = INSTR (9, 5);
   9697   unsigned rd = INSTR (4, 0);
   9698 
   9699   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9700   aarch64_set_reg_u64
   9701     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
   9702      & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
   9703 }
   9704 
   9705 /* 32 bit AND shifted register setting flags.  */
   9706 static void
   9707 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9708 {
   9709   unsigned rm = INSTR (20, 16);
   9710   unsigned rn = INSTR (9, 5);
   9711   unsigned rd = INSTR (4, 0);
   9712 
   9713   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   9714   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   9715 			       shift, count);
   9716 
   9717   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9718   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
   9719   set_flags_for_binop32 (cpu, value1 & value2);
   9720 }
   9721 
   9722 /* 64 bit AND shifted register setting flags.  */
   9723 static void
   9724 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9725 {
   9726   unsigned rm = INSTR (20, 16);
   9727   unsigned rn = INSTR (9, 5);
   9728   unsigned rd = INSTR (4, 0);
   9729 
   9730   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   9731   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
   9732 			       shift, count);
   9733 
   9734   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9735   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
   9736   set_flags_for_binop64 (cpu, value1 & value2);
   9737 }
   9738 
   9739 /* 32 bit BIC shifted register.  */
   9740 static void
   9741 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9742 {
   9743   unsigned rm = INSTR (20, 16);
   9744   unsigned rn = INSTR (9, 5);
   9745   unsigned rd = INSTR (4, 0);
   9746 
   9747   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9748   aarch64_set_reg_u64
   9749     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
   9750      & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
   9751 }
   9752 
   9753 /* 64 bit BIC shifted register.  */
   9754 static void
   9755 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9756 {
   9757   unsigned rm = INSTR (20, 16);
   9758   unsigned rn = INSTR (9, 5);
   9759   unsigned rd = INSTR (4, 0);
   9760 
   9761   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9762   aarch64_set_reg_u64
   9763     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
   9764      & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
   9765 }
   9766 
   9767 /* 32 bit BIC shifted register setting flags.  */
   9768 static void
   9769 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9770 {
   9771   unsigned rm = INSTR (20, 16);
   9772   unsigned rn = INSTR (9, 5);
   9773   unsigned rd = INSTR (4, 0);
   9774 
   9775   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   9776   uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   9777 				 shift, count);
   9778 
   9779   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9780   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
   9781   set_flags_for_binop32 (cpu, value1 & value2);
   9782 }
   9783 
   9784 /* 64 bit BIC shifted register setting flags.  */
   9785 static void
   9786 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9787 {
   9788   unsigned rm = INSTR (20, 16);
   9789   unsigned rn = INSTR (9, 5);
   9790   unsigned rd = INSTR (4, 0);
   9791 
   9792   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   9793   uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
   9794 				 shift, count);
   9795 
   9796   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9797   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
   9798   set_flags_for_binop64 (cpu, value1 & value2);
   9799 }
   9800 
   9801 /* 32 bit EON shifted register.  */
   9802 static void
   9803 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9804 {
   9805   unsigned rm = INSTR (20, 16);
   9806   unsigned rn = INSTR (9, 5);
   9807   unsigned rd = INSTR (4, 0);
   9808 
   9809   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9810   aarch64_set_reg_u64
   9811     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
   9812      ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
   9813 }
   9814 
   9815 /* 64 bit EON shifted register.  */
   9816 static void
   9817 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9818 {
   9819   unsigned rm = INSTR (20, 16);
   9820   unsigned rn = INSTR (9, 5);
   9821   unsigned rd = INSTR (4, 0);
   9822 
   9823   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9824   aarch64_set_reg_u64
   9825     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
   9826      ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
   9827 }
   9828 
   9829 /* 32 bit EOR shifted register.  */
   9830 static void
   9831 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9832 {
   9833   unsigned rm = INSTR (20, 16);
   9834   unsigned rn = INSTR (9, 5);
   9835   unsigned rd = INSTR (4, 0);
   9836 
   9837   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9838   aarch64_set_reg_u64
   9839     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
   9840      ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
   9841 }
   9842 
   9843 /* 64 bit EOR shifted register.  */
   9844 static void
   9845 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9846 {
   9847   unsigned rm = INSTR (20, 16);
   9848   unsigned rn = INSTR (9, 5);
   9849   unsigned rd = INSTR (4, 0);
   9850 
   9851   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9852   aarch64_set_reg_u64
   9853     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
   9854      ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
   9855 }
   9856 
   9857 /* 32 bit ORR shifted register.  */
   9858 static void
   9859 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9860 {
   9861   unsigned rm = INSTR (20, 16);
   9862   unsigned rn = INSTR (9, 5);
   9863   unsigned rd = INSTR (4, 0);
   9864 
   9865   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9866   aarch64_set_reg_u64
   9867     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
   9868      | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
   9869 }
   9870 
   9871 /* 64 bit ORR shifted register.  */
   9872 static void
   9873 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9874 {
   9875   unsigned rm = INSTR (20, 16);
   9876   unsigned rn = INSTR (9, 5);
   9877   unsigned rd = INSTR (4, 0);
   9878 
   9879   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9880   aarch64_set_reg_u64
   9881     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
   9882      | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
   9883 }
   9884 
   9885 /* 32 bit ORN shifted register.  */
   9886 static void
   9887 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9888 {
   9889   unsigned rm = INSTR (20, 16);
   9890   unsigned rn = INSTR (9, 5);
   9891   unsigned rd = INSTR (4, 0);
   9892 
   9893   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9894   aarch64_set_reg_u64
   9895     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
   9896      | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
   9897 }
   9898 
   9899 /* 64 bit ORN shifted register.  */
   9900 static void
   9901 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
   9902 {
   9903   unsigned rm = INSTR (20, 16);
   9904   unsigned rn = INSTR (9, 5);
   9905   unsigned rd = INSTR (4, 0);
   9906 
   9907   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9908   aarch64_set_reg_u64
   9909     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
   9910      | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
   9911 }
   9912 
   9913 static void
   9914 dexLogicalImmediate (sim_cpu *cpu)
   9915 {
   9916   /* assert instr[28,23] = 1001000
   9917      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
   9918      instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
   9919      instr[22] = N : used to construct immediate mask
   9920      instr[21,16] = immr
   9921      instr[15,10] = imms
   9922      instr[9,5] = Rn
   9923      instr[4,0] = Rd  */
   9924 
   9925   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
   9926   uint32_t size = INSTR (31, 31);
   9927   uint32_t N = INSTR (22, 22);
   9928   /* uint32_t immr = INSTR (21, 16);.  */
   9929   /* uint32_t imms = INSTR (15, 10);.  */
   9930   uint32_t index = INSTR (22, 10);
   9931   uint64_t bimm64 = LITable [index];
   9932   uint32_t dispatch = INSTR (30, 29);
   9933 
   9934   if (~size & N)
   9935     HALT_UNALLOC;
   9936 
   9937   if (!bimm64)
   9938     HALT_UNALLOC;
   9939 
   9940   if (size == 0)
   9941     {
   9942       uint32_t bimm = (uint32_t) bimm64;
   9943 
   9944       switch (dispatch)
   9945 	{
   9946 	case 0: and32 (cpu, bimm); return;
   9947 	case 1: orr32 (cpu, bimm); return;
   9948 	case 2: eor32 (cpu, bimm); return;
   9949 	case 3: ands32 (cpu, bimm); return;
   9950 	}
   9951     }
   9952   else
   9953     {
   9954       switch (dispatch)
   9955 	{
   9956 	case 0: and64 (cpu, bimm64); return;
   9957 	case 1: orr64 (cpu, bimm64); return;
   9958 	case 2: eor64 (cpu, bimm64); return;
   9959 	case 3: ands64 (cpu, bimm64); return;
   9960 	}
   9961     }
   9962   HALT_UNALLOC;
   9963 }
   9964 
/* Immediate move.
   The val argument is a 16 bit value to be inserted into the
   target register.  The pos argument locates the 16 bit word in the
   dest register, i.e. it is in {0, 1} for 32 bit and {0, 1, 2, 3}
   for 64 bit.
   N.B. the register arg may not be SP, so it should be
   accessed using the setGZRegisterXXX accessors.  */
   9972 
   9973 /* 32 bit move 16 bit immediate zero remaining shorts.  */
   9974 static void
   9975 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
   9976 {
   9977   unsigned rd = INSTR (4, 0);
   9978 
   9979   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9980   aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
   9981 }
   9982 
   9983 /* 64 bit move 16 bit immediate zero remaining shorts.  */
   9984 static void
   9985 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
   9986 {
   9987   unsigned rd = INSTR (4, 0);
   9988 
   9989   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   9990   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
   9991 }
   9992 
   9993 /* 32 bit move 16 bit immediate negated.  */
   9994 static void
   9995 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
   9996 {
   9997   unsigned rd = INSTR (4, 0);
   9998 
   9999   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10000   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
   10001 }
   10002 
   10003 /* 64 bit move 16 bit immediate negated.  */
   10004 static void
   10005 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
   10006 {
   10007   unsigned rd = INSTR (4, 0);
   10008 
   10009   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10010   aarch64_set_reg_u64
   10011     (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
   10012 		      ^ 0xffffffffffffffffULL));
   10013 }
   10014 
   10015 /* 32 bit move 16 bit immediate keep remaining shorts.  */
   10016 static void
   10017 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
   10018 {
   10019   unsigned rd = INSTR (4, 0);
   10020   uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
   10021   uint32_t value = val << (pos * 16);
   10022   uint32_t mask = ~(0xffffU << (pos * 16));
   10023 
   10024   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10025   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
   10026 }
   10027 
   10028 /* 64 bit move 16 it immediate keep remaining shorts.  */
   10029 static void
   10030 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
   10031 {
   10032   unsigned rd = INSTR (4, 0);
   10033   uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
   10034   uint64_t value = (uint64_t) val << (pos * 16);
   10035   uint64_t mask = ~(0xffffULL << (pos * 16));
   10036 
   10037   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10038   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
   10039 }
   10040 
   10041 static void
   10042 dexMoveWideImmediate (sim_cpu *cpu)
   10043 {
   10044   /* assert instr[28:23] = 100101
   10045      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
   10046      instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
   10047      instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
   10048      instr[20,5] = uimm16
   10049      instr[4,0] = Rd  */
   10050 
   10051   /* N.B. the (multiple of 16) shift is applied by the called routine,
   10052      we just pass the multiplier.  */
   10053 
   10054   uint32_t imm;
   10055   uint32_t size = INSTR (31, 31);
   10056   uint32_t op = INSTR (30, 29);
   10057   uint32_t shift = INSTR (22, 21);
   10058 
   10059   /* 32 bit can only shift 0 or 1 lot of 16.
   10060      anything else is an unallocated instruction.  */
   10061   if (size == 0 && (shift > 1))
   10062     HALT_UNALLOC;
   10063 
   10064   if (op == 1)
   10065     HALT_UNALLOC;
   10066 
   10067   imm = INSTR (20, 5);
   10068 
   10069   if (size == 0)
   10070     {
   10071       if (op == 0)
   10072 	movn32 (cpu, imm, shift);
   10073       else if (op == 2)
   10074 	movz32 (cpu, imm, shift);
   10075       else
   10076 	movk32 (cpu, imm, shift);
   10077     }
   10078   else
   10079     {
   10080       if (op == 0)
   10081 	movn64 (cpu, imm, shift);
   10082       else if (op == 2)
   10083 	movz64 (cpu, imm, shift);
   10084       else
   10085 	movk64 (cpu, imm, shift);
   10086     }
   10087 }
   10088 
   10089 /* Bitfield operations.
   10090    These take a pair of bit positions r and s which are in {0..31}
   10091    or {0..63} depending on the instruction word size.
   10092    N.B register args may not be SP.  */
   10093 
/* OK, we start with ubfm, which just needs to pick
   some bits out of the source, zero the rest, and write
   the result to dest.  This just needs two logical shifts.  */
   10097 
   10098 /* 32 bit bitfield move, left and right of affected zeroed
   10099    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
   10100 static void
   10101 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
   10102 {
   10103   unsigned rd;
   10104   unsigned rn = INSTR (9, 5);
   10105   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   10106 
   10107   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
   10108   if (r <= s)
   10109     {
   10110       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
   10111          We want only bits s:xxx:r at the bottom of the word
   10112          so we LSL bit s up to bit 31 i.e. by 31 - s
   10113          and then we LSR to bring bit 31 down to bit s - r
   10114 	 i.e. by 31 + r - s.  */
   10115       value <<= 31 - s;
   10116       value >>= 31 + r - s;
   10117     }
   10118   else
   10119     {
   10120       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
   10121          We want only bits s:xxx:0 starting at it 31-(r-1)
   10122          so we LSL bit s up to bit 31 i.e. by 31 - s
   10123          and then we LSL to bring bit 31 down to 31-(r-1)+s
   10124 	 i.e. by r - (s + 1).  */
   10125       value <<= 31 - s;
   10126       value >>= r - (s + 1);
   10127     }
   10128 
   10129   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10130   rd = INSTR (4, 0);
   10131   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
   10132 }
   10133 
   10134 /* 64 bit bitfield move, left and right of affected zeroed
   10135    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
   10136 static void
   10137 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
   10138 {
   10139   unsigned rd;
   10140   unsigned rn = INSTR (9, 5);
   10141   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   10142 
   10143   if (r <= s)
   10144     {
   10145       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
   10146          We want only bits s:xxx:r at the bottom of the word.
   10147          So we LSL bit s up to bit 63 i.e. by 63 - s
   10148          and then we LSR to bring bit 63 down to bit s - r
   10149 	 i.e. by 63 + r - s.  */
   10150       value <<= 63 - s;
   10151       value >>= 63 + r - s;
   10152     }
   10153   else
   10154     {
   10155       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
   10156          We want only bits s:xxx:0 starting at it 63-(r-1).
   10157          So we LSL bit s up to bit 63 i.e. by 63 - s
   10158          and then we LSL to bring bit 63 down to 63-(r-1)+s
   10159 	 i.e. by r - (s + 1).  */
   10160       value <<= 63 - s;
   10161       value >>= r - (s + 1);
   10162     }
   10163 
   10164   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10165   rd = INSTR (4, 0);
   10166   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
   10167 }
   10168 
/* The signed versions need to insert sign bits
   on the left of the inserted bit field, so we do
   much the same as the unsigned version except that we
   use an arithmetic shift right -- this just means
   we need to operate on signed values.  */
   10174 
   10175 /* 32 bit bitfield move, left of affected sign-extended, right zeroed.  */
   10176 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
   10177 static void
   10178 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
   10179 {
   10180   unsigned rd;
   10181   unsigned rn = INSTR (9, 5);
   10182   /* as per ubfm32 but use an ASR instead of an LSR.  */
   10183   int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
   10184 
   10185   if (r <= s)
   10186     {
   10187       value <<= 31 - s;
   10188       value >>= 31 + r - s;
   10189     }
   10190   else
   10191     {
   10192       value <<= 31 - s;
   10193       value >>= r - (s + 1);
   10194     }
   10195 
   10196   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10197   rd = INSTR (4, 0);
   10198   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
   10199 }
   10200 
   10201 /* 64 bit bitfield move, left of affected sign-extended, right zeroed.  */
   10202 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
   10203 static void
   10204 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
   10205 {
   10206   unsigned rd;
   10207   unsigned rn = INSTR (9, 5);
   10208   /* acpu per ubfm but use an ASR instead of an LSR.  */
   10209   int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
   10210 
   10211   if (r <= s)
   10212     {
   10213       value <<= 63 - s;
   10214       value >>= 63 + r - s;
   10215     }
   10216   else
   10217     {
   10218       value <<= 63 - s;
   10219       value >>= r - (s + 1);
   10220     }
   10221 
   10222   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10223   rd = INSTR (4, 0);
   10224   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
   10225 }
   10226 
/* Finally, these versions leave non-affected bits
   as is, so we need to generate the bits as per
   ubfm and also generate a mask to pick the
   bits from the original and computed values.  */
   10231 
   10232 /* 32 bit bitfield move, non-affected bits left as is.
   10233    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
   10234 static void
   10235 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
   10236 {
   10237   unsigned rn = INSTR (9, 5);
   10238   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   10239   uint32_t mask = -1;
   10240   unsigned rd;
   10241   uint32_t value2;
   10242 
   10243   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
   10244   if (r <= s)
   10245     {
   10246       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
   10247          We want only bits s:xxx:r at the bottom of the word
   10248          so we LSL bit s up to bit 31 i.e. by 31 - s
   10249          and then we LSR to bring bit 31 down to bit s - r
   10250 	 i.e. by 31 + r - s.  */
   10251       value <<= 31 - s;
   10252       value >>= 31 + r - s;
   10253       /* the mask must include the same bits.  */
   10254       mask <<= 31 - s;
   10255       mask >>= 31 + r - s;
   10256     }
   10257   else
   10258     {
   10259       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
   10260          We want only bits s:xxx:0 starting at it 31-(r-1)
   10261          so we LSL bit s up to bit 31 i.e. by 31 - s
   10262          and then we LSL to bring bit 31 down to 31-(r-1)+s
   10263 	 i.e. by r - (s + 1).  */
   10264       value <<= 31 - s;
   10265       value >>= r - (s + 1);
   10266       /* The mask must include the same bits.  */
   10267       mask <<= 31 - s;
   10268       mask >>= r - (s + 1);
   10269     }
   10270 
   10271   rd = INSTR (4, 0);
   10272   value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
   10273 
   10274   value2 &= ~mask;
   10275   value2 |= value;
   10276 
   10277   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10278   aarch64_set_reg_u64
   10279     (cpu, rd, NO_SP, (aarch64_get_reg_u32 (cpu, rd, NO_SP) & ~mask) | value);
   10280 }
   10281 
   10282 /* 64 bit bitfield move, non-affected bits left as is.
   10283    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
   10284 static void
   10285 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
   10286 {
   10287   unsigned rd;
   10288   unsigned rn = INSTR (9, 5);
   10289   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   10290   uint64_t mask = 0xffffffffffffffffULL;
   10291 
   10292   if (r <= s)
   10293     {
   10294       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
   10295          We want only bits s:xxx:r at the bottom of the word
   10296          so we LSL bit s up to bit 63 i.e. by 63 - s
   10297          and then we LSR to bring bit 63 down to bit s - r
   10298 	 i.e. by 63 + r - s.  */
   10299       value <<= 63 - s;
   10300       value >>= 63 + r - s;
   10301       /* The mask must include the same bits.  */
   10302       mask <<= 63 - s;
   10303       mask >>= 63 + r - s;
   10304     }
   10305   else
   10306     {
   10307       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
   10308          We want only bits s:xxx:0 starting at it 63-(r-1)
   10309          so we LSL bit s up to bit 63 i.e. by 63 - s
   10310          and then we LSL to bring bit 63 down to 63-(r-1)+s
   10311 	 i.e. by r - (s + 1).  */
   10312       value <<= 63 - s;
   10313       value >>= r - (s + 1);
   10314       /* The mask must include the same bits.  */
   10315       mask <<= 63 - s;
   10316       mask >>= r - (s + 1);
   10317     }
   10318 
   10319   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10320   rd = INSTR (4, 0);
   10321   aarch64_set_reg_u64
   10322     (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
   10323 }
   10324 
   10325 static void
   10326 dexBitfieldImmediate (sim_cpu *cpu)
   10327 {
   10328   /* assert instr[28:23] = 100110
   10329      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
   10330      instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
   10331      instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
   10332      instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
   10333      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
   10334      instr[9,5] = Rn
   10335      instr[4,0] = Rd  */
   10336 
   10337   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
   10338   uint32_t dispatch;
   10339   uint32_t imms;
   10340   uint32_t size = INSTR (31, 31);
   10341   uint32_t N = INSTR (22, 22);
   10342   /* 32 bit operations must have immr[5] = 0 and imms[5] = 0.  */
   10343   /* or else we have an UNALLOC.  */
   10344   uint32_t immr = INSTR (21, 16);
   10345 
   10346   if (~size & N)
   10347     HALT_UNALLOC;
   10348 
   10349   if (!size && uimm (immr, 5, 5))
   10350     HALT_UNALLOC;
   10351 
   10352   imms = INSTR (15, 10);
   10353   if (!size && uimm (imms, 5, 5))
   10354     HALT_UNALLOC;
   10355 
   10356   /* Switch on combined size and op.  */
   10357   dispatch = INSTR (31, 29);
   10358   switch (dispatch)
   10359     {
   10360     case 0: sbfm32 (cpu, immr, imms); return;
   10361     case 1: bfm32 (cpu, immr, imms); return;
   10362     case 2: ubfm32 (cpu, immr, imms); return;
   10363     case 4: sbfm (cpu, immr, imms); return;
   10364     case 5: bfm (cpu, immr, imms); return;
   10365     case 6: ubfm (cpu, immr, imms); return;
   10366     default: HALT_UNALLOC;
   10367     }
   10368 }
   10369 
   10370 static void
   10371 do_EXTR_32 (sim_cpu *cpu)
   10372 {
   10373   /* instr[31:21] = 00010011100
   10374      instr[20,16] = Rm
   10375      instr[15,10] = imms :  0xxxxx for 32 bit
   10376      instr[9,5]   = Rn
   10377      instr[4,0]   = Rd  */
   10378   unsigned rm   = INSTR (20, 16);
   10379   unsigned imms = INSTR (15, 10) & 31;
   10380   unsigned rn   = INSTR ( 9,  5);
   10381   unsigned rd   = INSTR ( 4,  0);
   10382   uint64_t val1;
   10383   uint64_t val2;
   10384 
   10385   val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
   10386   val1 >>= imms;
   10387   val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   10388   val2 <<= (32 - imms);
   10389 
   10390   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   10391   aarch64_set_reg_u64 (cpu, rd, NO_SP, val1 | val2);
   10392 }
   10393 
   10394 static void
   10395 do_EXTR_64 (sim_cpu *cpu)
   10396 {
   10397   /* instr[31:21] = 10010011100
   10398      instr[20,16] = Rm
   10399      instr[15,10] = imms
   10400      instr[9,5]   = Rn
   10401      instr[4,0]   = Rd  */
   10402   unsigned rm   = INSTR (20, 16);
   10403   unsigned imms = INSTR (15, 10) & 63;
   10404   unsigned rn   = INSTR ( 9,  5);
   10405   unsigned rd   = INSTR ( 4,  0);
   10406   uint64_t val;
   10407 
   10408   val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
   10409   val >>= imms;
   10410   val |= (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms));
   10411 
   10412   aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
   10413 }
   10414 
   10415 static void
   10416 dexExtractImmediate (sim_cpu *cpu)
   10417 {
   10418   /* assert instr[28:23] = 100111
   10419      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   10420      instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
   10421      instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
   10422      instr[21]    = op0 : must be 0 or UNALLOC
   10423      instr[20,16] = Rm
   10424      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
   10425      instr[9,5]   = Rn
   10426      instr[4,0]   = Rd  */
   10427 
   10428   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
   10429   /* 64 bit operations must have N = 1 or else we have an UNALLOC.  */
   10430   uint32_t dispatch;
   10431   uint32_t size = INSTR (31, 31);
   10432   uint32_t N = INSTR (22, 22);
   10433   /* 32 bit operations must have imms[5] = 0
   10434      or else we have an UNALLOC.  */
   10435   uint32_t imms = INSTR (15, 10);
   10436 
   10437   if (size ^ N)
   10438     HALT_UNALLOC;
   10439 
   10440   if (!size && uimm (imms, 5, 5))
   10441     HALT_UNALLOC;
   10442 
   10443   /* Switch on combined size and op.  */
   10444   dispatch = INSTR (31, 29);
   10445 
   10446   if (dispatch == 0)
   10447     do_EXTR_32 (cpu);
   10448 
   10449   else if (dispatch == 4)
   10450     do_EXTR_64 (cpu);
   10451 
   10452   else if (dispatch == 1)
   10453     HALT_NYI;
   10454   else
   10455     HALT_UNALLOC;
   10456 }
   10457 
   10458 static void
   10459 dexDPImm (sim_cpu *cpu)
   10460 {
   10461   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
   10462      assert  group == GROUP_DPIMM_1000 || grpoup == GROUP_DPIMM_1001
   10463      bits [25,23] of a DPImm are the secondary dispatch vector.  */
   10464   uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
   10465 
   10466   switch (group2)
   10467     {
   10468     case DPIMM_PCADR_000:
   10469     case DPIMM_PCADR_001:
   10470       dexPCRelAddressing (cpu);
   10471       return;
   10472 
   10473     case DPIMM_ADDSUB_010:
   10474     case DPIMM_ADDSUB_011:
   10475       dexAddSubtractImmediate (cpu);
   10476       return;
   10477 
   10478     case DPIMM_LOG_100:
   10479       dexLogicalImmediate (cpu);
   10480       return;
   10481 
   10482     case DPIMM_MOV_101:
   10483       dexMoveWideImmediate (cpu);
   10484       return;
   10485 
   10486     case DPIMM_BITF_110:
   10487       dexBitfieldImmediate (cpu);
   10488       return;
   10489 
   10490     case DPIMM_EXTR_111:
   10491       dexExtractImmediate (cpu);
   10492       return;
   10493 
   10494     default:
   10495       /* Should never reach here.  */
   10496       HALT_NYI;
   10497     }
   10498 }
   10499 
   10500 static void
   10501 dexLoadUnscaledImmediate (sim_cpu *cpu)
   10502 {
   10503   /* instr[29,24] == 111_00
   10504      instr[21] == 0
   10505      instr[11,10] == 00
   10506      instr[31,30] = size
   10507      instr[26] = V
   10508      instr[23,22] = opc
   10509      instr[20,12] = simm9
   10510      instr[9,5] = rn may be SP.  */
   10511   /* unsigned rt = INSTR (4, 0);  */
   10512   uint32_t V = INSTR (26, 26);
   10513   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
   10514   int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
   10515 
   10516   if (!V)
   10517     {
   10518       /* GReg operations.  */
   10519       switch (dispatch)
   10520 	{
   10521 	case 0:	 sturb (cpu, imm); return;
   10522 	case 1:	 ldurb32 (cpu, imm); return;
   10523 	case 2:	 ldursb64 (cpu, imm); return;
   10524 	case 3:	 ldursb32 (cpu, imm); return;
   10525 	case 4:	 sturh (cpu, imm); return;
   10526 	case 5:	 ldurh32 (cpu, imm); return;
   10527 	case 6:	 ldursh64 (cpu, imm); return;
   10528 	case 7:	 ldursh32 (cpu, imm); return;
   10529 	case 8:	 stur32 (cpu, imm); return;
   10530 	case 9:	 ldur32 (cpu, imm); return;
   10531 	case 10: ldursw (cpu, imm); return;
   10532 	case 12: stur64 (cpu, imm); return;
   10533 	case 13: ldur64 (cpu, imm); return;
   10534 
   10535 	case 14:
   10536 	  /* PRFUM NYI.  */
   10537 	  HALT_NYI;
   10538 
   10539 	default:
   10540 	case 11:
   10541 	case 15:
   10542 	  HALT_UNALLOC;
   10543 	}
   10544     }
   10545 
   10546   /* FReg operations.  */
   10547   switch (dispatch)
   10548     {
   10549     case 2:  fsturq (cpu, imm); return;
   10550     case 3:  fldurq (cpu, imm); return;
   10551     case 8:  fsturs (cpu, imm); return;
   10552     case 9:  fldurs (cpu, imm); return;
   10553     case 12: fsturd (cpu, imm); return;
   10554     case 13: fldurd (cpu, imm); return;
   10555 
   10556     case 0: /* STUR 8 bit FP.  */
   10557     case 1: /* LDUR 8 bit FP.  */
   10558     case 4: /* STUR 16 bit FP.  */
   10559     case 5: /* LDUR 8 bit FP.  */
   10560       HALT_NYI;
   10561 
   10562     default:
   10563     case 6:
   10564     case 7:
   10565     case 10:
   10566     case 11:
   10567     case 14:
   10568     case 15:
   10569       HALT_UNALLOC;
   10570     }
   10571 }
   10572 
/* N.B. A preliminary note regarding all the ldrs<x>32 instructions.

   The signed value loaded by these instructions is cast to unsigned
   before being passed to aarch64_set_reg_u64 (cpu, N, ...), i.e. before
   being stored in the 64 bit element of the GReg union.  This performs
   a 32 bit sign extension (as required) but avoids 64 bit sign
   extension, thus ensuring that the top half of the register word is
   zero.  This is what the spec demands when a 32 bit load occurs.  */
   10582 
   10583 /* 32 bit load sign-extended byte scaled unsigned 12 bit.  */
   10584 static void
   10585 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
   10586 {
   10587   unsigned int rn = INSTR (9, 5);
   10588   unsigned int rt = INSTR (4, 0);
   10589 
   10590   /* The target register may not be SP but the source may be
   10591      there is no scaling required for a byte load.  */
   10592   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
   10593   aarch64_set_reg_u64 (cpu, rt, NO_SP,
   10594 		       (int64_t) aarch64_get_mem_s8 (cpu, address));
   10595 }
   10596 
   10597 /* 32 bit load sign-extended byte scaled or unscaled zero-
   10598    or sign-extended 32-bit register offset.  */
   10599 static void
   10600 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   10601 {
   10602   unsigned int rm = INSTR (20, 16);
   10603   unsigned int rn = INSTR (9, 5);
   10604   unsigned int rt = INSTR (4, 0);
   10605 
   10606   /* rn may reference SP, rm and rt must reference ZR.  */
   10607 
   10608   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10609   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   10610 				 extension);
   10611 
   10612   /* There is no scaling required for a byte load.  */
   10613   aarch64_set_reg_u64
   10614     (cpu, rt, NO_SP, (int64_t) aarch64_get_mem_s8 (cpu, address
   10615 						   + displacement));
   10616 }
   10617 
   10618 /* 32 bit load sign-extended byte unscaled signed 9 bit with
   10619    pre- or post-writeback.  */
   10620 static void
   10621 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   10622 {
   10623   uint64_t address;
   10624   unsigned int rn = INSTR (9, 5);
   10625   unsigned int rt = INSTR (4, 0);
   10626 
   10627   if (rn == rt && wb != NoWriteBack)
   10628     HALT_UNALLOC;
   10629 
   10630   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10631 
   10632   if (wb == Pre)
   10633       address += offset;
   10634 
   10635   aarch64_set_reg_u64 (cpu, rt, NO_SP,
   10636 		       (int64_t) aarch64_get_mem_s8 (cpu, address));
   10637 
   10638   if (wb == Post)
   10639     address += offset;
   10640 
   10641   if (wb != NoWriteBack)
   10642     aarch64_set_reg_u64 (cpu, rn, NO_SP, address);
   10643 }
   10644 
   10645 /* 8 bit store scaled.  */
   10646 static void
   10647 fstrb_abs (sim_cpu *cpu, uint32_t offset)
   10648 {
   10649   unsigned st = INSTR (4, 0);
   10650   unsigned rn = INSTR (9, 5);
   10651 
   10652   aarch64_set_mem_u8 (cpu,
   10653 		      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
   10654 		      aarch64_get_vec_u8 (cpu, st, 0));
   10655 }
   10656 
   10657 /* 8 bit store scaled or unscaled zero- or
   10658    sign-extended 8-bit register offset.  */
   10659 static void
   10660 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   10661 {
   10662   unsigned rm = INSTR (20, 16);
   10663   unsigned rn = INSTR (9, 5);
   10664   unsigned st = INSTR (4, 0);
   10665 
   10666   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10667   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   10668 			       extension);
   10669   uint64_t  displacement = scaling == Scaled ? extended : 0;
   10670 
   10671   aarch64_set_mem_u8
   10672     (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
   10673 }
   10674 
   10675 /* 16 bit store scaled.  */
   10676 static void
   10677 fstrh_abs (sim_cpu *cpu, uint32_t offset)
   10678 {
   10679   unsigned st = INSTR (4, 0);
   10680   unsigned rn = INSTR (9, 5);
   10681 
   10682   aarch64_set_mem_u16
   10683     (cpu,
   10684      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
   10685      aarch64_get_vec_u16 (cpu, st, 0));
   10686 }
   10687 
   10688 /* 16 bit store scaled or unscaled zero-
   10689    or sign-extended 16-bit register offset.  */
   10690 static void
   10691 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   10692 {
   10693   unsigned rm = INSTR (20, 16);
   10694   unsigned rn = INSTR (9, 5);
   10695   unsigned st = INSTR (4, 0);
   10696 
   10697   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10698   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   10699 			       extension);
   10700   uint64_t  displacement = OPT_SCALE (extended, 16, scaling);
   10701 
   10702   aarch64_set_mem_u16
   10703     (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
   10704 }
   10705 
   10706 /* 32 bit store scaled unsigned 12 bit.  */
   10707 static void
   10708 fstrs_abs (sim_cpu *cpu, uint32_t offset)
   10709 {
   10710   unsigned st = INSTR (4, 0);
   10711   unsigned rn = INSTR (9, 5);
   10712 
   10713   aarch64_set_mem_u32
   10714     (cpu,
   10715      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
   10716      aarch64_get_vec_u32 (cpu, st, 0));
   10717 }
   10718 
   10719 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
   10720 static void
   10721 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   10722 {
   10723   unsigned rn = INSTR (9, 5);
   10724   unsigned st = INSTR (4, 0);
   10725 
   10726   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10727 
   10728   if (wb != Post)
   10729     address += offset;
   10730 
   10731   aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
   10732 
   10733   if (wb == Post)
   10734     address += offset;
   10735 
   10736   if (wb != NoWriteBack)
   10737     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   10738 }
   10739 
   10740 /* 32 bit store scaled or unscaled zero-
   10741    or sign-extended 32-bit register offset.  */
   10742 static void
   10743 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   10744 {
   10745   unsigned rm = INSTR (20, 16);
   10746   unsigned rn = INSTR (9, 5);
   10747   unsigned st = INSTR (4, 0);
   10748 
   10749   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10750   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   10751 			       extension);
   10752   uint64_t  displacement = OPT_SCALE (extended, 32, scaling);
   10753 
   10754   aarch64_set_mem_u32
   10755     (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
   10756 }
   10757 
   10758 /* 64 bit store scaled unsigned 12 bit.  */
   10759 static void
   10760 fstrd_abs (sim_cpu *cpu, uint32_t offset)
   10761 {
   10762   unsigned st = INSTR (4, 0);
   10763   unsigned rn = INSTR (9, 5);
   10764 
   10765   aarch64_set_mem_u64
   10766     (cpu,
   10767      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
   10768      aarch64_get_vec_u64 (cpu, st, 0));
   10769 }
   10770 
   10771 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
   10772 static void
   10773 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   10774 {
   10775   unsigned rn = INSTR (9, 5);
   10776   unsigned st = INSTR (4, 0);
   10777 
   10778   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10779 
   10780   if (wb != Post)
   10781     address += offset;
   10782 
   10783   aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
   10784 
   10785   if (wb == Post)
   10786     address += offset;
   10787 
   10788   if (wb != NoWriteBack)
   10789     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   10790 }
   10791 
   10792 /* 64 bit store scaled or unscaled zero-
   10793    or sign-extended 32-bit register offset.  */
   10794 static void
   10795 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   10796 {
   10797   unsigned rm = INSTR (20, 16);
   10798   unsigned rn = INSTR (9, 5);
   10799   unsigned st = INSTR (4, 0);
   10800 
   10801   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10802   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   10803 			       extension);
   10804   uint64_t  displacement = OPT_SCALE (extended, 64, scaling);
   10805 
   10806   aarch64_set_mem_u64
   10807     (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
   10808 }
   10809 
   10810 /* 128 bit store scaled unsigned 12 bit.  */
   10811 static void
   10812 fstrq_abs (sim_cpu *cpu, uint32_t offset)
   10813 {
   10814   FRegister a;
   10815   unsigned st = INSTR (4, 0);
   10816   unsigned rn = INSTR (9, 5);
   10817   uint64_t addr;
   10818 
   10819   aarch64_get_FP_long_double (cpu, st, & a);
   10820 
   10821   addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
   10822   aarch64_set_mem_long_double (cpu, addr, a);
   10823 }
   10824 
   10825 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback.  */
   10826 static void
   10827 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   10828 {
   10829   FRegister a;
   10830   unsigned rn = INSTR (9, 5);
   10831   unsigned st = INSTR (4, 0);
   10832   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10833 
   10834   if (wb != Post)
   10835     address += offset;
   10836 
   10837   aarch64_get_FP_long_double (cpu, st, & a);
   10838   aarch64_set_mem_long_double (cpu, address, a);
   10839 
   10840   if (wb == Post)
   10841     address += offset;
   10842 
   10843   if (wb != NoWriteBack)
   10844     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
   10845 }
   10846 
   10847 /* 128 bit store scaled or unscaled zero-
   10848    or sign-extended 32-bit register offset.  */
   10849 static void
   10850 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   10851 {
   10852   unsigned rm = INSTR (20, 16);
   10853   unsigned rn = INSTR (9, 5);
   10854   unsigned st = INSTR (4, 0);
   10855 
   10856   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
   10857   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
   10858 			       extension);
   10859   uint64_t  displacement = OPT_SCALE (extended, 128, scaling);
   10860 
   10861   FRegister a;
   10862 
   10863   aarch64_get_FP_long_double (cpu, st, & a);
   10864   aarch64_set_mem_long_double (cpu, address + displacement, a);
   10865 }
   10866 
   10867 static void
   10868 dexLoadImmediatePrePost (sim_cpu *cpu)
   10869 {
   10870   /* instr[31,30] = size
   10871      instr[29,27] = 111
   10872      instr[26]    = V
   10873      instr[25,24] = 00
   10874      instr[23,22] = opc
   10875      instr[21]    = 0
   10876      instr[20,12] = simm9
   10877      instr[11]    = wb : 0 ==> Post, 1 ==> Pre
   10878      instr[10]    = 0
   10879      instr[9,5]   = Rn may be SP.
   10880      instr[4,0]   = Rt */
   10881 
   10882   uint32_t  V        = INSTR (26, 26);
   10883   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
   10884   int32_t   imm      = simm32 (aarch64_get_instr (cpu), 20, 12);
   10885   WriteBack wb       = INSTR (11, 11);
   10886 
   10887   if (!V)
   10888     {
   10889       /* GReg operations.  */
   10890       switch (dispatch)
   10891 	{
   10892 	case 0:	 strb_wb (cpu, imm, wb); return;
   10893 	case 1:	 ldrb32_wb (cpu, imm, wb); return;
   10894 	case 2:	 ldrsb_wb (cpu, imm, wb); return;
   10895 	case 3:	 ldrsb32_wb (cpu, imm, wb); return;
   10896 	case 4:	 strh_wb (cpu, imm, wb); return;
   10897 	case 5:	 ldrh32_wb (cpu, imm, wb); return;
   10898 	case 6:	 ldrsh64_wb (cpu, imm, wb); return;
   10899 	case 7:	 ldrsh32_wb (cpu, imm, wb); return;
   10900 	case 8:	 str32_wb (cpu, imm, wb); return;
   10901 	case 9:	 ldr32_wb (cpu, imm, wb); return;
   10902 	case 10: ldrsw_wb (cpu, imm, wb); return;
   10903 	case 12: str_wb (cpu, imm, wb); return;
   10904 	case 13: ldr_wb (cpu, imm, wb); return;
   10905 
   10906 	default:
   10907 	case 11:
   10908 	case 14:
   10909 	case 15:
   10910 	  HALT_UNALLOC;
   10911 	}
   10912     }
   10913 
   10914   /* FReg operations.  */
   10915   switch (dispatch)
   10916     {
   10917     case 2:  fstrq_wb (cpu, imm, wb); return;
   10918     case 3:  fldrq_wb (cpu, imm, wb); return;
   10919     case 8:  fstrs_wb (cpu, imm, wb); return;
   10920     case 9:  fldrs_wb (cpu, imm, wb); return;
   10921     case 12: fstrd_wb (cpu, imm, wb); return;
   10922     case 13: fldrd_wb (cpu, imm, wb); return;
   10923 
   10924     case 0:	  /* STUR 8 bit FP.  */
   10925     case 1:	  /* LDUR 8 bit FP.  */
   10926     case 4:	  /* STUR 16 bit FP.  */
   10927     case 5:	  /* LDUR 8 bit FP.  */
   10928       HALT_NYI;
   10929 
   10930     default:
   10931     case 6:
   10932     case 7:
   10933     case 10:
   10934     case 11:
   10935     case 14:
   10936     case 15:
   10937       HALT_UNALLOC;
   10938     }
   10939 }
   10940 
   10941 static void
   10942 dexLoadRegisterOffset (sim_cpu *cpu)
   10943 {
   10944   /* instr[31,30] = size
   10945      instr[29,27] = 111
   10946      instr[26]    = V
   10947      instr[25,24] = 00
   10948      instr[23,22] = opc
   10949      instr[21]    = 1
   10950      instr[20,16] = rm
   10951      instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
   10952                              110 ==> SXTW, 111 ==> SXTX,
   10953                              ow ==> RESERVED
   10954      instr[12]    = scaled
   10955      instr[11,10] = 10
   10956      instr[9,5]   = rn
   10957      instr[4,0]   = rt.  */
   10958 
   10959   uint32_t  V = INSTR (26, 26);
   10960   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
   10961   Scaling   scale = INSTR (12, 12);
   10962   Extension extensionType = INSTR (15, 13);
   10963 
   10964   /* Check for illegal extension types.  */
   10965   if (uimm (extensionType, 1, 1) == 0)
   10966     HALT_UNALLOC;
   10967 
   10968   if (extensionType == UXTX || extensionType == SXTX)
   10969     extensionType = NoExtension;
   10970 
   10971   if (!V)
   10972     {
   10973       /* GReg operations.  */
   10974       switch (dispatch)
   10975 	{
   10976 	case 0:	 strb_scale_ext (cpu, scale, extensionType); return;
   10977 	case 1:	 ldrb32_scale_ext (cpu, scale, extensionType); return;
   10978 	case 2:	 ldrsb_scale_ext (cpu, scale, extensionType); return;
   10979 	case 3:	 ldrsb32_scale_ext (cpu, scale, extensionType); return;
   10980 	case 4:	 strh_scale_ext (cpu, scale, extensionType); return;
   10981 	case 5:	 ldrh32_scale_ext (cpu, scale, extensionType); return;
   10982 	case 6:	 ldrsh_scale_ext (cpu, scale, extensionType); return;
   10983 	case 7:	 ldrsh32_scale_ext (cpu, scale, extensionType); return;
   10984 	case 8:	 str32_scale_ext (cpu, scale, extensionType); return;
   10985 	case 9:	 ldr32_scale_ext (cpu, scale, extensionType); return;
   10986 	case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
   10987 	case 12: str_scale_ext (cpu, scale, extensionType); return;
   10988 	case 13: ldr_scale_ext (cpu, scale, extensionType); return;
   10989 	case 14: prfm_scale_ext (cpu, scale, extensionType); return;
   10990 
   10991 	default:
   10992 	case 11:
   10993 	case 15:
   10994 	  HALT_UNALLOC;
   10995 	}
   10996     }
   10997 
   10998   /* FReg operations.  */
   10999   switch (dispatch)
   11000     {
   11001     case 1: /* LDUR 8 bit FP.  */
   11002       HALT_NYI;
   11003     case 3:  fldrq_scale_ext (cpu, scale, extensionType); return;
   11004     case 5: /* LDUR 8 bit FP.  */
   11005       HALT_NYI;
   11006     case 9:  fldrs_scale_ext (cpu, scale, extensionType); return;
   11007     case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
   11008 
   11009     case 0:  fstrb_scale_ext (cpu, scale, extensionType); return;
   11010     case 2:  fstrq_scale_ext (cpu, scale, extensionType); return;
   11011     case 4:  fstrh_scale_ext (cpu, scale, extensionType); return;
   11012     case 8:  fstrs_scale_ext (cpu, scale, extensionType); return;
   11013     case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
   11014 
   11015     default:
   11016     case 6:
   11017     case 7:
   11018     case 10:
   11019     case 11:
   11020     case 14:
   11021     case 15:
   11022       HALT_UNALLOC;
   11023     }
   11024 }
   11025 
   11026 static void
   11027 dexLoadUnsignedImmediate (sim_cpu *cpu)
   11028 {
   11029   /* instr[29,24] == 111_01
   11030      instr[31,30] = size
   11031      instr[26]    = V
   11032      instr[23,22] = opc
   11033      instr[21,10] = uimm12 : unsigned immediate offset
   11034      instr[9,5]   = rn may be SP.
   11035      instr[4,0]   = rt.  */
   11036 
   11037   uint32_t V = INSTR (26,26);
   11038   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
   11039   uint32_t imm = INSTR (21, 10);
   11040 
   11041   if (!V)
   11042     {
   11043       /* GReg operations.  */
   11044       switch (dispatch)
   11045 	{
   11046 	case 0:  strb_abs (cpu, imm); return;
   11047 	case 1:  ldrb32_abs (cpu, imm); return;
   11048 	case 2:  ldrsb_abs (cpu, imm); return;
   11049 	case 3:  ldrsb32_abs (cpu, imm); return;
   11050 	case 4:  strh_abs (cpu, imm); return;
   11051 	case 5:  ldrh32_abs (cpu, imm); return;
   11052 	case 6:  ldrsh_abs (cpu, imm); return;
   11053 	case 7:  ldrsh32_abs (cpu, imm); return;
   11054 	case 8:  str32_abs (cpu, imm); return;
   11055 	case 9:  ldr32_abs (cpu, imm); return;
   11056 	case 10: ldrsw_abs (cpu, imm); return;
   11057 	case 12: str_abs (cpu, imm); return;
   11058 	case 13: ldr_abs (cpu, imm); return;
   11059 	case 14: prfm_abs (cpu, imm); return;
   11060 
   11061 	default:
   11062 	case 11:
   11063 	case 15:
   11064 	  HALT_UNALLOC;
   11065 	}
   11066     }
   11067 
   11068   /* FReg operations.  */
   11069   switch (dispatch)
   11070     {
   11071     case 0:  fstrb_abs (cpu, imm); return;
   11072     case 4:  fstrh_abs (cpu, imm); return;
   11073     case 8:  fstrs_abs (cpu, imm); return;
   11074     case 12: fstrd_abs (cpu, imm); return;
   11075     case 2:  fstrq_abs (cpu, imm); return;
   11076 
   11077     case 1:  fldrb_abs (cpu, imm); return;
   11078     case 5:  fldrh_abs (cpu, imm); return;
   11079     case 9:  fldrs_abs (cpu, imm); return;
   11080     case 13: fldrd_abs (cpu, imm); return;
   11081     case 3:  fldrq_abs (cpu, imm); return;
   11082 
   11083     default:
   11084     case 6:
   11085     case 7:
   11086     case 10:
   11087     case 11:
   11088     case 14:
   11089     case 15:
   11090       HALT_UNALLOC;
   11091     }
   11092 }
   11093 
   11094 static void
   11095 dexLoadExclusive (sim_cpu *cpu)
   11096 {
   11097   /* assert instr[29:24] = 001000;
   11098      instr[31,30] = size
   11099      instr[23] = 0 if exclusive
   11100      instr[22] = L : 1 if load, 0 if store
   11101      instr[21] = 1 if pair
   11102      instr[20,16] = Rs
   11103      instr[15] = o0 : 1 if ordered
   11104      instr[14,10] = Rt2
   11105      instr[9,5] = Rn
   11106      instr[4.0] = Rt.  */
   11107 
   11108   switch (INSTR (22, 21))
   11109     {
   11110     case 2:   ldxr (cpu); return;
   11111     case 0:   stxr (cpu); return;
   11112     default:  HALT_NYI;
   11113     }
   11114 }
   11115 
   11116 static void
   11117 dexLoadOther (sim_cpu *cpu)
   11118 {
   11119   uint32_t dispatch;
   11120 
   11121   /* instr[29,25] = 111_0
   11122      instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
   11123      instr[21:11,10] is the secondary dispatch.  */
   11124   if (INSTR (24, 24))
   11125     {
   11126       dexLoadUnsignedImmediate (cpu);
   11127       return;
   11128     }
   11129 
   11130   dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
   11131   switch (dispatch)
   11132     {
   11133     case 0: dexLoadUnscaledImmediate (cpu); return;
   11134     case 1: dexLoadImmediatePrePost (cpu); return;
   11135     case 3: dexLoadImmediatePrePost (cpu); return;
   11136     case 6: dexLoadRegisterOffset (cpu); return;
   11137 
   11138     default:
   11139     case 2:
   11140     case 4:
   11141     case 5:
   11142     case 7:
   11143       HALT_NYI;
   11144     }
   11145 }
   11146 
   11147 static void
   11148 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11149 {
   11150   unsigned rn = INSTR (14, 10);
   11151   unsigned rd = INSTR (9, 5);
   11152   unsigned rm = INSTR (4, 0);
   11153   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11154 
   11155   if ((rn == rd || rm == rd) && wb != NoWriteBack)
   11156     HALT_UNALLOC; /* ??? */
   11157 
   11158   offset <<= 2;
   11159 
   11160   if (wb != Post)
   11161     address += offset;
   11162 
   11163   aarch64_set_mem_u32 (cpu, address,
   11164 		       aarch64_get_reg_u32 (cpu, rm, NO_SP));
   11165   aarch64_set_mem_u32 (cpu, address + 4,
   11166 		       aarch64_get_reg_u32 (cpu, rn, NO_SP));
   11167 
   11168   if (wb == Post)
   11169     address += offset;
   11170 
   11171   if (wb != NoWriteBack)
   11172     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11173 }
   11174 
   11175 static void
   11176 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11177 {
   11178   unsigned rn = INSTR (14, 10);
   11179   unsigned rd = INSTR (9, 5);
   11180   unsigned rm = INSTR (4, 0);
   11181   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11182 
   11183   if ((rn == rd || rm == rd) && wb != NoWriteBack)
   11184     HALT_UNALLOC; /* ??? */
   11185 
   11186   offset <<= 3;
   11187 
   11188   if (wb != Post)
   11189     address += offset;
   11190 
   11191   aarch64_set_mem_u64 (cpu, address,
   11192 		       aarch64_get_reg_u64 (cpu, rm, NO_SP));
   11193   aarch64_set_mem_u64 (cpu, address + 8,
   11194 		       aarch64_get_reg_u64 (cpu, rn, NO_SP));
   11195 
   11196   if (wb == Post)
   11197     address += offset;
   11198 
   11199   if (wb != NoWriteBack)
   11200     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11201 }
   11202 
   11203 static void
   11204 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11205 {
   11206   unsigned rn = INSTR (14, 10);
   11207   unsigned rd = INSTR (9, 5);
   11208   unsigned rm = INSTR (4, 0);
   11209   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11210 
   11211   /* Treat this as unalloc to make sure we don't do it.  */
   11212   if (rn == rm)
   11213     HALT_UNALLOC;
   11214 
   11215   offset <<= 2;
   11216 
   11217   if (wb != Post)
   11218     address += offset;
   11219 
   11220   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
   11221   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
   11222 
   11223   if (wb == Post)
   11224     address += offset;
   11225 
   11226   if (wb != NoWriteBack)
   11227     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11228 }
   11229 
   11230 static void
   11231 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11232 {
   11233   unsigned rn = INSTR (14, 10);
   11234   unsigned rd = INSTR (9, 5);
   11235   unsigned rm = INSTR (4, 0);
   11236   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11237 
   11238   /* Treat this as unalloc to make sure we don't do it.  */
   11239   if (rn == rm)
   11240     HALT_UNALLOC;
   11241 
   11242   offset <<= 2;
   11243 
   11244   if (wb != Post)
   11245     address += offset;
   11246 
   11247   aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
   11248   aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
   11249 
   11250   if (wb == Post)
   11251     address += offset;
   11252 
   11253   if (wb != NoWriteBack)
   11254     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11255 }
   11256 
   11257 static void
   11258 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11259 {
   11260   unsigned rn = INSTR (14, 10);
   11261   unsigned rd = INSTR (9, 5);
   11262   unsigned rm = INSTR (4, 0);
   11263   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11264 
   11265   /* Treat this as unalloc to make sure we don't do it.  */
   11266   if (rn == rm)
   11267     HALT_UNALLOC;
   11268 
   11269   offset <<= 3;
   11270 
   11271   if (wb != Post)
   11272     address += offset;
   11273 
   11274   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
   11275   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
   11276 
   11277   if (wb == Post)
   11278     address += offset;
   11279 
   11280   if (wb != NoWriteBack)
   11281     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11282 }
   11283 
   11284 static void
   11285 dex_load_store_pair_gr (sim_cpu *cpu)
   11286 {
   11287   /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
   11288      instr[29,25] = instruction encoding: 101_0
   11289      instr[26]    = V : 1 if fp 0 if gp
   11290      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
   11291      instr[22]    = load/store (1=> load)
   11292      instr[21,15] = signed, scaled, offset
   11293      instr[14,10] = Rn
   11294      instr[ 9, 5] = Rd
   11295      instr[ 4, 0] = Rm.  */
   11296 
   11297   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
   11298   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
   11299 
   11300   switch (dispatch)
   11301     {
   11302     case 2: store_pair_u32 (cpu, offset, Post); return;
   11303     case 3: load_pair_u32  (cpu, offset, Post); return;
   11304     case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
   11305     case 5: load_pair_u32  (cpu, offset, NoWriteBack); return;
   11306     case 6: store_pair_u32 (cpu, offset, Pre); return;
   11307     case 7: load_pair_u32  (cpu, offset, Pre); return;
   11308 
   11309     case 11: load_pair_s32  (cpu, offset, Post); return;
   11310     case 13: load_pair_s32  (cpu, offset, NoWriteBack); return;
   11311     case 15: load_pair_s32  (cpu, offset, Pre); return;
   11312 
   11313     case 18: store_pair_u64 (cpu, offset, Post); return;
   11314     case 19: load_pair_u64  (cpu, offset, Post); return;
   11315     case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
   11316     case 21: load_pair_u64  (cpu, offset, NoWriteBack); return;
   11317     case 22: store_pair_u64 (cpu, offset, Pre); return;
   11318     case 23: load_pair_u64  (cpu, offset, Pre); return;
   11319 
   11320     default:
   11321       HALT_UNALLOC;
   11322     }
   11323 }
   11324 
   11325 static void
   11326 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11327 {
   11328   unsigned rn = INSTR (14, 10);
   11329   unsigned rd = INSTR (9, 5);
   11330   unsigned rm = INSTR (4, 0);
   11331   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11332 
   11333   offset <<= 2;
   11334 
   11335   if (wb != Post)
   11336     address += offset;
   11337 
   11338   aarch64_set_mem_u32 (cpu, address,     aarch64_get_vec_u32 (cpu, rm, 0));
   11339   aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
   11340 
   11341   if (wb == Post)
   11342     address += offset;
   11343 
   11344   if (wb != NoWriteBack)
   11345     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11346 }
   11347 
   11348 static void
   11349 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11350 {
   11351   unsigned rn = INSTR (14, 10);
   11352   unsigned rd = INSTR (9, 5);
   11353   unsigned rm = INSTR (4, 0);
   11354   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11355 
   11356   offset <<= 3;
   11357 
   11358   if (wb != Post)
   11359     address += offset;
   11360 
   11361   aarch64_set_mem_u64 (cpu, address,     aarch64_get_vec_u64 (cpu, rm, 0));
   11362   aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
   11363 
   11364   if (wb == Post)
   11365     address += offset;
   11366 
   11367   if (wb != NoWriteBack)
   11368     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11369 }
   11370 
   11371 static void
   11372 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11373 {
   11374   FRegister a;
   11375   unsigned rn = INSTR (14, 10);
   11376   unsigned rd = INSTR (9, 5);
   11377   unsigned rm = INSTR (4, 0);
   11378   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11379 
   11380   offset <<= 4;
   11381 
   11382   if (wb != Post)
   11383     address += offset;
   11384 
   11385   aarch64_get_FP_long_double (cpu, rm, & a);
   11386   aarch64_set_mem_long_double (cpu, address, a);
   11387   aarch64_get_FP_long_double (cpu, rn, & a);
   11388   aarch64_set_mem_long_double (cpu, address + 16, a);
   11389 
   11390   if (wb == Post)
   11391     address += offset;
   11392 
   11393   if (wb != NoWriteBack)
   11394     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11395 }
   11396 
   11397 static void
   11398 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11399 {
   11400   unsigned rn = INSTR (14, 10);
   11401   unsigned rd = INSTR (9, 5);
   11402   unsigned rm = INSTR (4, 0);
   11403   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11404 
   11405   if (rm == rn)
   11406     HALT_UNALLOC;
   11407 
   11408   offset <<= 2;
   11409 
   11410   if (wb != Post)
   11411     address += offset;
   11412 
   11413   aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
   11414   aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
   11415 
   11416   if (wb == Post)
   11417     address += offset;
   11418 
   11419   if (wb != NoWriteBack)
   11420     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11421 }
   11422 
   11423 static void
   11424 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11425 {
   11426   unsigned rn = INSTR (14, 10);
   11427   unsigned rd = INSTR (9, 5);
   11428   unsigned rm = INSTR (4, 0);
   11429   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11430 
   11431   if (rm == rn)
   11432     HALT_UNALLOC;
   11433 
   11434   offset <<= 3;
   11435 
   11436   if (wb != Post)
   11437     address += offset;
   11438 
   11439   aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
   11440   aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
   11441 
   11442   if (wb == Post)
   11443     address += offset;
   11444 
   11445   if (wb != NoWriteBack)
   11446     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11447 }
   11448 
   11449 static void
   11450 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
   11451 {
   11452   FRegister a;
   11453   unsigned rn = INSTR (14, 10);
   11454   unsigned rd = INSTR (9, 5);
   11455   unsigned rm = INSTR (4, 0);
   11456   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
   11457 
   11458   if (rm == rn)
   11459     HALT_UNALLOC;
   11460 
   11461   offset <<= 4;
   11462 
   11463   if (wb != Post)
   11464     address += offset;
   11465 
   11466   aarch64_get_mem_long_double (cpu, address, & a);
   11467   aarch64_set_FP_long_double (cpu, rm, a);
   11468   aarch64_get_mem_long_double (cpu, address + 16, & a);
   11469   aarch64_set_FP_long_double (cpu, rn, a);
   11470 
   11471   if (wb == Post)
   11472     address += offset;
   11473 
   11474   if (wb != NoWriteBack)
   11475     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
   11476 }
   11477 
   11478 static void
   11479 dex_load_store_pair_fp (sim_cpu *cpu)
   11480 {
   11481   /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
   11482      instr[29,25] = instruction encoding
   11483      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
   11484      instr[22]    = load/store (1=> load)
   11485      instr[21,15] = signed, scaled, offset
   11486      instr[14,10] = Rn
   11487      instr[ 9, 5] = Rd
   11488      instr[ 4, 0] = Rm  */
   11489 
   11490   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
   11491   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
   11492 
   11493   switch (dispatch)
   11494     {
   11495     case 2: store_pair_float (cpu, offset, Post); return;
   11496     case 3: load_pair_float  (cpu, offset, Post); return;
   11497     case 4: store_pair_float (cpu, offset, NoWriteBack); return;
   11498     case 5: load_pair_float  (cpu, offset, NoWriteBack); return;
   11499     case 6: store_pair_float (cpu, offset, Pre); return;
   11500     case 7: load_pair_float  (cpu, offset, Pre); return;
   11501 
   11502     case 10: store_pair_double (cpu, offset, Post); return;
   11503     case 11: load_pair_double  (cpu, offset, Post); return;
   11504     case 12: store_pair_double (cpu, offset, NoWriteBack); return;
   11505     case 13: load_pair_double  (cpu, offset, NoWriteBack); return;
   11506     case 14: store_pair_double (cpu, offset, Pre); return;
   11507     case 15: load_pair_double  (cpu, offset, Pre); return;
   11508 
   11509     case 18: store_pair_long_double (cpu, offset, Post); return;
   11510     case 19: load_pair_long_double  (cpu, offset, Post); return;
   11511     case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
   11512     case 21: load_pair_long_double  (cpu, offset, NoWriteBack); return;
   11513     case 22: store_pair_long_double (cpu, offset, Pre); return;
   11514     case 23: load_pair_long_double  (cpu, offset, Pre); return;
   11515 
   11516     default:
   11517       HALT_UNALLOC;
   11518     }
   11519 }
   11520 
/* Return the index of the vector register O places after V.  There are
   32 vector registers and the register list of a structure load/store
   wraps from V31 back to V0, so the sum is reduced modulo 32.  */
static inline unsigned
vec_reg (unsigned v, unsigned o)
{
  return (v + o) & 0x1F;
}
   11526 
   11527 /* Load multiple N-element structures to M consecutive registers.  */
   11528 static void
   11529 vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
   11530 {
   11531   int      all  = INSTR (30, 30);
   11532   unsigned size = INSTR (11, 10);
   11533   unsigned vd   = INSTR (4, 0);
   11534   unsigned rpt = (N == M) ? 1 : M;
   11535   unsigned selem = N;
   11536   unsigned i, j, k;
   11537 
   11538   switch (size)
   11539     {
   11540     case 0: /* 8-bit operations.  */
   11541       for (i = 0; i < rpt; i++)
   11542 	for (j = 0; j < (8 + (8 * all)); j++)
   11543 	  for (k = 0; k < selem; k++)
   11544 	    {
   11545 	      aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
   11546 				  aarch64_get_mem_u8 (cpu, address));
   11547 	      address += 1;
   11548 	    }
   11549       return;
   11550 
   11551     case 1: /* 16-bit operations.  */
   11552       for (i = 0; i < rpt; i++)
   11553 	for (j = 0; j < (4 + (4 * all)); j++)
   11554 	  for (k = 0; k < selem; k++)
   11555 	    {
   11556 	      aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
   11557 				   aarch64_get_mem_u16 (cpu, address));
   11558 	      address += 2;
   11559 	    }
   11560       return;
   11561 
   11562     case 2: /* 32-bit operations.  */
   11563       for (i = 0; i < rpt; i++)
   11564 	for (j = 0; j < (2 + (2 * all)); j++)
   11565 	  for (k = 0; k < selem; k++)
   11566 	    {
   11567 	      aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
   11568 				   aarch64_get_mem_u32 (cpu, address));
   11569 	      address += 4;
   11570 	    }
   11571       return;
   11572 
   11573     case 3: /* 64-bit operations.  */
   11574       for (i = 0; i < rpt; i++)
   11575 	for (j = 0; j < (1 + all); j++)
   11576 	  for (k = 0; k < selem; k++)
   11577 	    {
   11578 	      aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
   11579 				   aarch64_get_mem_u64 (cpu, address));
   11580 	      address += 8;
   11581 	    }
   11582       return;
   11583     }
   11584 }
   11585 
   11586 /* Load multiple 4-element structures into four consecutive registers.  */
   11587 static void
   11588 LD4 (sim_cpu *cpu, uint64_t address)
   11589 {
   11590   vec_load (cpu, address, 4, 4);
   11591 }
   11592 
   11593 /* Load multiple 3-element structures into three consecutive registers.  */
   11594 static void
   11595 LD3 (sim_cpu *cpu, uint64_t address)
   11596 {
   11597   vec_load (cpu, address, 3, 3);
   11598 }
   11599 
   11600 /* Load multiple 2-element structures into two consecutive registers.  */
   11601 static void
   11602 LD2 (sim_cpu *cpu, uint64_t address)
   11603 {
   11604   vec_load (cpu, address, 2, 2);
   11605 }
   11606 
   11607 /* Load multiple 1-element structures into one register.  */
   11608 static void
   11609 LD1_1 (sim_cpu *cpu, uint64_t address)
   11610 {
   11611   vec_load (cpu, address, 1, 1);
   11612 }
   11613 
   11614 /* Load multiple 1-element structures into two registers.  */
   11615 static void
   11616 LD1_2 (sim_cpu *cpu, uint64_t address)
   11617 {
   11618   vec_load (cpu, address, 1, 2);
   11619 }
   11620 
   11621 /* Load multiple 1-element structures into three registers.  */
   11622 static void
   11623 LD1_3 (sim_cpu *cpu, uint64_t address)
   11624 {
   11625   vec_load (cpu, address, 1, 3);
   11626 }
   11627 
   11628 /* Load multiple 1-element structures into four registers.  */
   11629 static void
   11630 LD1_4 (sim_cpu *cpu, uint64_t address)
   11631 {
   11632   vec_load (cpu, address, 1, 4);
   11633 }
   11634 
   11635 /* Store multiple N-element structures from M consecutive registers.  */
   11636 static void
   11637 vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
   11638 {
   11639   int      all  = INSTR (30, 30);
   11640   unsigned size = INSTR (11, 10);
   11641   unsigned vd   = INSTR (4, 0);
   11642   unsigned rpt = (N == M) ? 1 : M;
   11643   unsigned selem = N;
   11644   unsigned i, j, k;
   11645 
   11646   switch (size)
   11647     {
   11648     case 0: /* 8-bit operations.  */
   11649       for (i = 0; i < rpt; i++)
   11650 	for (j = 0; j < (8 + (8 * all)); j++)
   11651 	  for (k = 0; k < selem; k++)
   11652 	    {
   11653 	      aarch64_set_mem_u8
   11654 		(cpu, address,
   11655 		 aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
   11656 	      address += 1;
   11657 	    }
   11658       return;
   11659 
   11660     case 1: /* 16-bit operations.  */
   11661       for (i = 0; i < rpt; i++)
   11662 	for (j = 0; j < (4 + (4 * all)); j++)
   11663 	  for (k = 0; k < selem; k++)
   11664 	    {
   11665 	      aarch64_set_mem_u16
   11666 		(cpu, address,
   11667 		 aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
   11668 	      address += 2;
   11669 	    }
   11670       return;
   11671 
   11672     case 2: /* 32-bit operations.  */
   11673       for (i = 0; i < rpt; i++)
   11674 	for (j = 0; j < (2 + (2 * all)); j++)
   11675 	  for (k = 0; k < selem; k++)
   11676 	    {
   11677 	      aarch64_set_mem_u32
   11678 		(cpu, address,
   11679 		 aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
   11680 	      address += 4;
   11681 	    }
   11682       return;
   11683 
   11684     case 3: /* 64-bit operations.  */
   11685       for (i = 0; i < rpt; i++)
   11686 	for (j = 0; j < (1 + all); j++)
   11687 	  for (k = 0; k < selem; k++)
   11688 	    {
   11689 	      aarch64_set_mem_u64
   11690 		(cpu, address,
   11691 		 aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
   11692 	      address += 8;
   11693 	    }
   11694       return;
   11695     }
   11696 }
   11697 
   11698 /* Store multiple 4-element structure from four consecutive registers.  */
   11699 static void
   11700 ST4 (sim_cpu *cpu, uint64_t address)
   11701 {
   11702   vec_store (cpu, address, 4, 4);
   11703 }
   11704 
   11705 /* Store multiple 3-element structures from three consecutive registers.  */
   11706 static void
   11707 ST3 (sim_cpu *cpu, uint64_t address)
   11708 {
   11709   vec_store (cpu, address, 3, 3);
   11710 }
   11711 
   11712 /* Store multiple 2-element structures from two consecutive registers.  */
   11713 static void
   11714 ST2 (sim_cpu *cpu, uint64_t address)
   11715 {
   11716   vec_store (cpu, address, 2, 2);
   11717 }
   11718 
   11719 /* Store multiple 1-element structures from one register.  */
   11720 static void
   11721 ST1_1 (sim_cpu *cpu, uint64_t address)
   11722 {
   11723   vec_store (cpu, address, 1, 1);
   11724 }
   11725 
   11726 /* Store multiple 1-element structures from two registers.  */
   11727 static void
   11728 ST1_2 (sim_cpu *cpu, uint64_t address)
   11729 {
   11730   vec_store (cpu, address, 1, 2);
   11731 }
   11732 
   11733 /* Store multiple 1-element structures from three registers.  */
   11734 static void
   11735 ST1_3 (sim_cpu *cpu, uint64_t address)
   11736 {
   11737   vec_store (cpu, address, 1, 3);
   11738 }
   11739 
   11740 /* Store multiple 1-element structures from four registers.  */
   11741 static void
   11742 ST1_4 (sim_cpu *cpu, uint64_t address)
   11743 {
   11744   vec_store (cpu, address, 1, 4);
   11745 }
   11746 
/* Work out the lane index and element size for the single-structure
   LDn/STn forms.

   The expanding scope must provide FULL, S and SIZE (holding the raw
   instr[11,10] field on entry) plus an assignable LANE, and CPU for
   INSTR and HALT_UNALLOC.  On exit LANE is the element index and SIZE
   is 0, 1, 2 or 3 for 8, 16, 32 or 64-bit elements.  Field
   combinations that do not name a lane are reported through
   HALT_UNALLOC.  */
#define LDn_STn_SINGLE_LANE_AND_SIZE()				\
  do								\
    {								\
      const unsigned ldst_opc_hi = INSTR (15, 14);		\
								\
      if (ldst_opc_hi == 0)					\
	{							\
	  /* Bytes: lane is full:s:size.  */			\
	  lane = (full << 3) | (s << 2) | size;			\
	  size = 0;						\
	}							\
      else if (ldst_opc_hi == 1)				\
	{							\
	  /* Halfwords: size bit 0 must be clear.  */		\
	  if ((size & 1) != 0)					\
	    HALT_UNALLOC;					\
	  lane = (full << 2) | (s << 1) | (size >> 1);		\
	  size = 1;						\
	}							\
      else if (ldst_opc_hi == 2)				\
	{							\
	  if ((size & 2) != 0)					\
	    HALT_UNALLOC;					\
								\
	  if ((size & 1) != 0)					\
	    {							\
	      /* Doublewords: s must be clear.  */		\
	      if (s)						\
		HALT_UNALLOC;					\
	      lane = full;					\
	      size = 3;						\
	    }							\
	  else							\
	    {							\
	      /* Words: lane is full:s.  */			\
	      lane = (full << 1) | s;				\
	      size = 2;						\
	    }							\
	}							\
      else							\
	HALT_UNALLOC;						\
    }								\
  while (0)
   11787 
   11788 /* Load single structure into one lane of N registers.  */
   11789 static void
   11790 do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
   11791 {
   11792   /* instr[31]    = 0
   11793      instr[30]    = element selector 0=>half, 1=>all elements
   11794      instr[29,24] = 00 1101
   11795      instr[23]    = 0=>simple, 1=>post
   11796      instr[22]    = 1
   11797      instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
   11798      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
   11799                       11111 (immediate post inc)
   11800      instr[15,13] = opcode
   11801      instr[12]    = S, used for lane number
   11802      instr[11,10] = size, also used for lane number
   11803      instr[9,5]   = address
   11804      instr[4,0]   = Vd  */
   11805 
   11806   unsigned full = INSTR (30, 30);
   11807   unsigned vd = INSTR (4, 0);
   11808   unsigned size = INSTR (11, 10);
   11809   unsigned s = INSTR (12, 12);
   11810   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
   11811   int lane = 0;
   11812   int i;
   11813 
   11814   NYI_assert (29, 24, 0x0D);
   11815   NYI_assert (22, 22, 1);
   11816 
   11817   /* Compute the lane number first (using size), and then compute size.  */
   11818   LDn_STn_SINGLE_LANE_AND_SIZE ();
   11819 
   11820   for (i = 0; i < nregs; i++)
   11821     switch (size)
   11822       {
   11823       case 0:
   11824 	{
   11825 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
   11826 	  aarch64_set_vec_u8 (cpu, vd + i, lane, val);
   11827 	  break;
   11828 	}
   11829 
   11830       case 1:
   11831 	{
   11832 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
   11833 	  aarch64_set_vec_u16 (cpu, vd + i, lane, val);
   11834 	  break;
   11835 	}
   11836 
   11837       case 2:
   11838 	{
   11839 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
   11840 	  aarch64_set_vec_u32 (cpu, vd + i, lane, val);
   11841 	  break;
   11842 	}
   11843 
   11844       case 3:
   11845 	{
   11846 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
   11847 	  aarch64_set_vec_u64 (cpu, vd + i, lane, val);
   11848 	  break;
   11849 	}
   11850       }
   11851 }
   11852 
   11853 /* Store single structure from one lane from N registers.  */
   11854 static void
   11855 do_vec_STn_single (sim_cpu *cpu, uint64_t address)
   11856 {
   11857   /* instr[31]    = 0
   11858      instr[30]    = element selector 0=>half, 1=>all elements
   11859      instr[29,24] = 00 1101
   11860      instr[23]    = 0=>simple, 1=>post
   11861      instr[22]    = 0
   11862      instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
   11863      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
   11864                       11111 (immediate post inc)
   11865      instr[15,13] = opcode
   11866      instr[12]    = S, used for lane number
   11867      instr[11,10] = size, also used for lane number
   11868      instr[9,5]   = address
   11869      instr[4,0]   = Vd  */
   11870 
   11871   unsigned full = INSTR (30, 30);
   11872   unsigned vd = INSTR (4, 0);
   11873   unsigned size = INSTR (11, 10);
   11874   unsigned s = INSTR (12, 12);
   11875   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
   11876   int lane = 0;
   11877   int i;
   11878 
   11879   NYI_assert (29, 24, 0x0D);
   11880   NYI_assert (22, 22, 0);
   11881 
   11882   /* Compute the lane number first (using size), and then compute size.  */
   11883   LDn_STn_SINGLE_LANE_AND_SIZE ();
   11884 
   11885   for (i = 0; i < nregs; i++)
   11886     switch (size)
   11887       {
   11888       case 0:
   11889 	{
   11890 	  uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
   11891 	  aarch64_set_mem_u8 (cpu, address + i, val);
   11892 	  break;
   11893 	}
   11894 
   11895       case 1:
   11896 	{
   11897 	  uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
   11898 	  aarch64_set_mem_u16 (cpu, address + (i * 2), val);
   11899 	  break;
   11900 	}
   11901 
   11902       case 2:
   11903 	{
   11904 	  uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
   11905 	  aarch64_set_mem_u32 (cpu, address + (i * 4), val);
   11906 	  break;
   11907 	}
   11908 
   11909       case 3:
   11910 	{
   11911 	  uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
   11912 	  aarch64_set_mem_u64 (cpu, address + (i * 8), val);
   11913 	  break;
   11914 	}
   11915       }
   11916 }
   11917 
   11918 /* Load single structure into all lanes of N registers.  */
   11919 static void
   11920 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
   11921 {
   11922   /* instr[31]    = 0
   11923      instr[30]    = element selector 0=>half, 1=>all elements
   11924      instr[29,24] = 00 1101
   11925      instr[23]    = 0=>simple, 1=>post
   11926      instr[22]    = 1
   11927      instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
   11928      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
   11929                       11111 (immediate post inc)
   11930      instr[15,14] = 11
   11931      instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
   11932      instr[12]    = 0
   11933      instr[11,10] = element size 00=> byte(b), 01=> half(h),
   11934                                  10=> word(s), 11=> double(d)
   11935      instr[9,5]   = address
   11936      instr[4,0]   = Vd  */
   11937 
   11938   unsigned full = INSTR (30, 30);
   11939   unsigned vd = INSTR (4, 0);
   11940   unsigned size = INSTR (11, 10);
   11941   int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
   11942   int i, n;
   11943 
   11944   NYI_assert (29, 24, 0x0D);
   11945   NYI_assert (22, 22, 1);
   11946   NYI_assert (15, 14, 3);
   11947   NYI_assert (12, 12, 0);
   11948 
   11949   for (n = 0; n < nregs; n++)
   11950     switch (size)
   11951       {
   11952       case 0:
   11953 	{
   11954 	  uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
   11955 	  for (i = 0; i < (full ? 16 : 8); i++)
   11956 	    aarch64_set_vec_u8 (cpu, vd + n, i, val);
   11957 	  break;
   11958 	}
   11959 
   11960       case 1:
   11961 	{
   11962 	  uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
   11963 	  for (i = 0; i < (full ? 8 : 4); i++)
   11964 	    aarch64_set_vec_u16 (cpu, vd + n, i, val);
   11965 	  break;
   11966 	}
   11967 
   11968       case 2:
   11969 	{
   11970 	  uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
   11971 	  for (i = 0; i < (full ? 4 : 2); i++)
   11972 	    aarch64_set_vec_u32 (cpu, vd + n, i, val);
   11973 	  break;
   11974 	}
   11975 
   11976       case 3:
   11977 	{
   11978 	  uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
   11979 	  for (i = 0; i < (full ? 2 : 1); i++)
   11980 	    aarch64_set_vec_u64 (cpu, vd + n, i, val);
   11981 	  break;
   11982 	}
   11983 
   11984       default:
   11985 	HALT_UNALLOC;
   11986       }
   11987 }
   11988 
   11989 static void
   11990 do_vec_load_store (sim_cpu *cpu)
   11991 {
   11992   /* {LD|ST}<N>   {Vd..Vd+N}, vaddr
   11993 
   11994      instr[31]    = 0
   11995      instr[30]    = element selector 0=>half, 1=>all elements
   11996      instr[29,25] = 00110
   11997      instr[24]    = 0=>multiple struct, 1=>single struct
   11998      instr[23]    = 0=>simple, 1=>post
   11999      instr[22]    = 0=>store, 1=>load
   12000      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
   12001      instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
   12002                     11111 (immediate post inc)
   12003      instr[15,12] = elements and destinations.  eg for load:
   12004                      0000=>LD4 => load multiple 4-element to
   12005 		     four consecutive registers
   12006                      0100=>LD3 => load multiple 3-element to
   12007 		     three consecutive registers
   12008                      1000=>LD2 => load multiple 2-element to
   12009 		     two consecutive registers
   12010                      0010=>LD1 => load multiple 1-element to
   12011 		     four consecutive registers
   12012                      0110=>LD1 => load multiple 1-element to
   12013 		     three consecutive registers
   12014                      1010=>LD1 => load multiple 1-element to
   12015 		     two consecutive registers
   12016                      0111=>LD1 => load multiple 1-element to
   12017 		     one register
   12018                      1100=>LDR1,LDR2
   12019                      1110=>LDR3,LDR4
   12020      instr[11,10] = element size 00=> byte(b), 01=> half(h),
   12021                                  10=> word(s), 11=> double(d)
   12022      instr[9,5]   = Vn, can be SP
   12023      instr[4,0]   = Vd  */
   12024 
   12025   int single;
   12026   int post;
   12027   int load;
   12028   unsigned vn;
   12029   uint64_t address;
   12030   int type;
   12031 
   12032   if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
   12033     HALT_NYI;
   12034 
   12035   single = INSTR (24, 24);
   12036   post = INSTR (23, 23);
   12037   load = INSTR (22, 22);
   12038   type = INSTR (15, 12);
   12039   vn = INSTR (9, 5);
   12040   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
   12041 
   12042   if (! single && INSTR (21, 21) != 0)
   12043     HALT_UNALLOC;
   12044 
   12045   if (post)
   12046     {
   12047       unsigned vm = INSTR (20, 16);
   12048 
   12049       if (vm == R31)
   12050 	{
   12051 	  unsigned sizeof_operation;
   12052 
   12053 	  if (single)
   12054 	    {
   12055 	      if ((type >= 0) && (type <= 11))
   12056 		{
   12057 		  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
   12058 		  switch (INSTR (15, 14))
   12059 		    {
   12060 		    case 0:
   12061 		      sizeof_operation = nregs * 1;
   12062 		      break;
   12063 		    case 1:
   12064 		      sizeof_operation = nregs * 2;
   12065 		      break;
   12066 		    case 2:
   12067 		      if (INSTR (10, 10) == 0)
   12068 			sizeof_operation = nregs * 4;
   12069 		      else
   12070 			sizeof_operation = nregs * 8;
   12071 		      break;
   12072 		    default:
   12073 		      HALT_UNALLOC;
   12074 		    }
   12075 		}
   12076 	      else if (type == 0xC)
   12077 		{
   12078 		  sizeof_operation = INSTR (21, 21) ? 2 : 1;
   12079 		  sizeof_operation <<= INSTR (11, 10);
   12080 		}
   12081 	      else if (type == 0xE)
   12082 		{
   12083 		  sizeof_operation = INSTR (21, 21) ? 4 : 3;
   12084 		  sizeof_operation <<= INSTR (11, 10);
   12085 		}
   12086 	      else
   12087 		HALT_UNALLOC;
   12088 	    }
   12089 	  else
   12090 	    {
   12091 	      switch (type)
   12092 		{
   12093 		case 0: sizeof_operation = 32; break;
   12094 		case 4: sizeof_operation = 24; break;
   12095 		case 8: sizeof_operation = 16; break;
   12096 
   12097 		case 7:
   12098 		  /* One register, immediate offset variant.  */
   12099 		  sizeof_operation = 8;
   12100 		  break;
   12101 
   12102 		case 10:
   12103 		  /* Two registers, immediate offset variant.  */
   12104 		  sizeof_operation = 16;
   12105 		  break;
   12106 
   12107 		case 6:
   12108 		  /* Three registers, immediate offset variant.  */
   12109 		  sizeof_operation = 24;
   12110 		  break;
   12111 
   12112 		case 2:
   12113 		  /* Four registers, immediate offset variant.  */
   12114 		  sizeof_operation = 32;
   12115 		  break;
   12116 
   12117 		default:
   12118 		  HALT_UNALLOC;
   12119 		}
   12120 
   12121 	      if (INSTR (30, 30))
   12122 		sizeof_operation *= 2;
   12123 	    }
   12124 
   12125 	  aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
   12126 	}
   12127       else
   12128 	aarch64_set_reg_u64 (cpu, vn, SP_OK,
   12129 			     address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
   12130     }
   12131   else
   12132     {
   12133       NYI_assert (20, 16, 0);
   12134     }
   12135 
   12136   if (single)
   12137     {
   12138       if (load)
   12139 	{
   12140 	  if ((type >= 0) && (type <= 11))
   12141 	    do_vec_LDn_single (cpu, address);
   12142 	  else if ((type == 0xC) || (type == 0xE))
   12143 	    do_vec_LDnR (cpu, address);
   12144 	  else
   12145 	    HALT_UNALLOC;
   12146 	  return;
   12147 	}
   12148 
   12149       /* Stores.  */
   12150       if ((type >= 0) && (type <= 11))
   12151 	{
   12152 	  do_vec_STn_single (cpu, address);
   12153 	  return;
   12154 	}
   12155 
   12156       HALT_UNALLOC;
   12157     }
   12158 
   12159   if (load)
   12160     {
   12161       switch (type)
   12162 	{
   12163 	case 0:  LD4 (cpu, address); return;
   12164 	case 4:  LD3 (cpu, address); return;
   12165 	case 8:  LD2 (cpu, address); return;
   12166 	case 2:  LD1_4 (cpu, address); return;
   12167 	case 6:  LD1_3 (cpu, address); return;
   12168 	case 10: LD1_2 (cpu, address); return;
   12169 	case 7:  LD1_1 (cpu, address); return;
   12170 
   12171 	default:
   12172 	  HALT_UNALLOC;
   12173 	}
   12174     }
   12175 
   12176   /* Stores.  */
   12177   switch (type)
   12178     {
   12179     case 0:  ST4 (cpu, address); return;
   12180     case 4:  ST3 (cpu, address); return;
   12181     case 8:  ST2 (cpu, address); return;
   12182     case 2:  ST1_4 (cpu, address); return;
   12183     case 6:  ST1_3 (cpu, address); return;
   12184     case 10: ST1_2 (cpu, address); return;
   12185     case 7:  ST1_1 (cpu, address); return;
   12186     default:
   12187       HALT_UNALLOC;
   12188     }
   12189 }
   12190 
   12191 static void
   12192 dexLdSt (sim_cpu *cpu)
   12193 {
   12194   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
   12195      assert  group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
   12196              group == GROUP_LDST_1100 || group == GROUP_LDST_1110
   12197      bits [29,28:26] of a LS are the secondary dispatch vector.  */
   12198   uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
   12199 
   12200   switch (group2)
   12201     {
   12202     case LS_EXCL_000:
   12203       dexLoadExclusive (cpu); return;
   12204 
   12205     case LS_LIT_010:
   12206     case LS_LIT_011:
   12207       dexLoadLiteral (cpu); return;
   12208 
   12209     case LS_OTHER_110:
   12210     case LS_OTHER_111:
   12211       dexLoadOther (cpu); return;
   12212 
   12213     case LS_ADVSIMD_001:
   12214       do_vec_load_store (cpu); return;
   12215 
   12216     case LS_PAIR_100:
   12217       dex_load_store_pair_gr (cpu); return;
   12218 
   12219     case LS_PAIR_101:
   12220       dex_load_store_pair_fp (cpu); return;
   12221 
   12222     default:
   12223       /* Should never reach here.  */
   12224       HALT_NYI;
   12225     }
   12226 }
   12227 
   12228 /* Specific decode and execute for group Data Processing Register.  */
   12229 
   12230 static void
   12231 dexLogicalShiftedRegister (sim_cpu *cpu)
   12232 {
   12233   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   12234      instr[30,29] = op
   12235      instr[28:24] = 01010
   12236      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
   12237      instr[21]    = N
   12238      instr[20,16] = Rm
   12239      instr[15,10] = count : must be 0xxxxx for 32 bit
   12240      instr[9,5]   = Rn
   12241      instr[4,0]   = Rd  */
   12242 
   12243   uint32_t size      = INSTR (31, 31);
   12244   Shift    shiftType = INSTR (23, 22);
   12245   uint32_t count     = INSTR (15, 10);
   12246 
   12247   /* 32 bit operations must have count[5] = 0.
   12248      or else we have an UNALLOC.  */
   12249   if (size == 0 && uimm (count, 5, 5))
   12250     HALT_UNALLOC;
   12251 
   12252   /* Dispatch on size:op:N.  */
   12253   switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
   12254     {
   12255     case 0: and32_shift  (cpu, shiftType, count); return;
   12256     case 1: bic32_shift  (cpu, shiftType, count); return;
   12257     case 2: orr32_shift  (cpu, shiftType, count); return;
   12258     case 3: orn32_shift  (cpu, shiftType, count); return;
   12259     case 4: eor32_shift  (cpu, shiftType, count); return;
   12260     case 5: eon32_shift  (cpu, shiftType, count); return;
   12261     case 6: ands32_shift (cpu, shiftType, count); return;
   12262     case 7: bics32_shift (cpu, shiftType, count); return;
   12263     case 8: and64_shift  (cpu, shiftType, count); return;
   12264     case 9: bic64_shift  (cpu, shiftType, count); return;
   12265     case 10:orr64_shift  (cpu, shiftType, count); return;
   12266     case 11:orn64_shift  (cpu, shiftType, count); return;
   12267     case 12:eor64_shift  (cpu, shiftType, count); return;
   12268     case 13:eon64_shift  (cpu, shiftType, count); return;
   12269     case 14:ands64_shift (cpu, shiftType, count); return;
   12270     case 15:bics64_shift (cpu, shiftType, count); return;
   12271     }
   12272 }
   12273 
   12274 /* 32 bit conditional select.  */
   12275 static void
   12276 csel32 (sim_cpu *cpu, CondCode cc)
   12277 {
   12278   unsigned rm = INSTR (20, 16);
   12279   unsigned rn = INSTR (9, 5);
   12280   unsigned rd = INSTR (4, 0);
   12281 
   12282   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12283 		       testConditionCode (cpu, cc)
   12284 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
   12285 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP));
   12286 }
   12287 
   12288 /* 64 bit conditional select.  */
   12289 static void
   12290 csel64 (sim_cpu *cpu, CondCode cc)
   12291 {
   12292   unsigned rm = INSTR (20, 16);
   12293   unsigned rn = INSTR (9, 5);
   12294   unsigned rd = INSTR (4, 0);
   12295 
   12296   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12297 		       testConditionCode (cpu, cc)
   12298 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
   12299 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP));
   12300 }
   12301 
   12302 /* 32 bit conditional increment.  */
   12303 static void
   12304 csinc32 (sim_cpu *cpu, CondCode cc)
   12305 {
   12306   unsigned rm = INSTR (20, 16);
   12307   unsigned rn = INSTR (9, 5);
   12308   unsigned rd = INSTR (4, 0);
   12309 
   12310   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12311 		       testConditionCode (cpu, cc)
   12312 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
   12313 		       : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
   12314 }
   12315 
   12316 /* 64 bit conditional increment.  */
   12317 static void
   12318 csinc64 (sim_cpu *cpu, CondCode cc)
   12319 {
   12320   unsigned rm = INSTR (20, 16);
   12321   unsigned rn = INSTR (9, 5);
   12322   unsigned rd = INSTR (4, 0);
   12323 
   12324   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12325 		       testConditionCode (cpu, cc)
   12326 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
   12327 		       : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
   12328 }
   12329 
   12330 /* 32 bit conditional invert.  */
   12331 static void
   12332 csinv32 (sim_cpu *cpu, CondCode cc)
   12333 {
   12334   unsigned rm = INSTR (20, 16);
   12335   unsigned rn = INSTR (9, 5);
   12336   unsigned rd = INSTR (4, 0);
   12337 
   12338   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12339 		       testConditionCode (cpu, cc)
   12340 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
   12341 		       : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
   12342 }
   12343 
   12344 /* 64 bit conditional invert.  */
   12345 static void
   12346 csinv64 (sim_cpu *cpu, CondCode cc)
   12347 {
   12348   unsigned rm = INSTR (20, 16);
   12349   unsigned rn = INSTR (9, 5);
   12350   unsigned rd = INSTR (4, 0);
   12351 
   12352   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12353 		       testConditionCode (cpu, cc)
   12354 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
   12355 		       : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
   12356 }
   12357 
   12358 /* 32 bit conditional negate.  */
   12359 static void
   12360 csneg32 (sim_cpu *cpu, CondCode cc)
   12361 {
   12362   unsigned rm = INSTR (20, 16);
   12363   unsigned rn = INSTR (9, 5);
   12364   unsigned rd = INSTR (4, 0);
   12365 
   12366   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12367 		       testConditionCode (cpu, cc)
   12368 		       ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
   12369 		       : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
   12370 }
   12371 
   12372 /* 64 bit conditional negate.  */
   12373 static void
   12374 csneg64 (sim_cpu *cpu, CondCode cc)
   12375 {
   12376   unsigned rm = INSTR (20, 16);
   12377   unsigned rn = INSTR (9, 5);
   12378   unsigned rd = INSTR (4, 0);
   12379 
   12380   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12381 		       testConditionCode (cpu, cc)
   12382 		       ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
   12383 		       : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
   12384 }
   12385 
   12386 static void
   12387 dexCondSelect (sim_cpu *cpu)
   12388 {
   12389   /* instr[28,21] = 11011011
   12390      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   12391      instr[30:11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
   12392                             100 ==> CSINV, 101 ==> CSNEG,
   12393                             _1_ ==> UNALLOC
   12394      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
   12395      instr[15,12] = cond
   12396      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC  */
   12397 
   12398   CondCode cc = INSTR (15, 12);
   12399   uint32_t S = INSTR (29, 29);
   12400   uint32_t op2 = INSTR (11, 10);
   12401 
   12402   if (S == 1)
   12403     HALT_UNALLOC;
   12404 
   12405   if (op2 & 0x2)
   12406     HALT_UNALLOC;
   12407 
   12408   switch ((INSTR (31, 30) << 1) | op2)
   12409     {
   12410     case 0: csel32  (cpu, cc); return;
   12411     case 1: csinc32 (cpu, cc); return;
   12412     case 2: csinv32 (cpu, cc); return;
   12413     case 3: csneg32 (cpu, cc); return;
   12414     case 4: csel64  (cpu, cc); return;
   12415     case 5: csinc64 (cpu, cc); return;
   12416     case 6: csinv64 (cpu, cc); return;
   12417     case 7: csneg64 (cpu, cc); return;
   12418     }
   12419 }
   12420 
   12421 /* Some helpers for counting leading 1 or 0 bits.  */
   12422 
/* Counts the number of leading bits which are the same
   in a 32 bit value in the range 1 to 32.

   The count includes the top bit itself, so 0 and 0xffffffff both
   yield 32 and any value whose two top bits differ yields 1.  Only
   unsigned arithmetic is used, so no shift of a negative value
   occurs.  */
static uint32_t
leading32 (uint32_t value)
{
  /* Fold the "leading ones" case onto the "leading zeros" case:
     after this the top bit of DIFF is clear and the first set bit
     marks the first position that differs from the top bit.  */
  uint32_t diff = (value >> 31) ? ~value : value;
  uint32_t count = 0;

  /* Every bit matches the top bit.  */
  if (diff == 0)
    return 32;

  /* Binary search for the highest set bit of DIFF.  */
  if ((diff & 0xffff0000U) == 0)
    {
      count += 16;
      diff <<= 16;
    }
  if ((diff & 0xff000000U) == 0)
    {
      count += 8;
      diff <<= 8;
    }
  if ((diff & 0xf0000000U) == 0)
    {
      count += 4;
      diff <<= 4;
    }
  if ((diff & 0xc0000000U) == 0)
    {
      count += 2;
      diff <<= 2;
    }
  if ((diff & 0x80000000U) == 0)
    count += 1;

  return count;
}
   12466 
/* Counts the number of leading bits which are the same
   in a 64 bit value in the range 1 to 64.

   The count includes the top bit itself, so 0 and all-ones both
   yield 64 and any value whose two top bits differ yields 1.  Only
   unsigned arithmetic is used, so no shift of a negative value
   occurs.  */
static uint64_t
leading64 (uint64_t value)
{
  /* Fold the "leading ones" case onto the "leading zeros" case:
     after this the top bit of DIFF is clear and the first set bit
     marks the first position that differs from the top bit.  */
  uint64_t diff = (value >> 63) ? ~value : value;
  uint64_t count = 0;

  /* Every bit matches the top bit.  */
  if (diff == 0)
    return 64;

  /* Binary search for the highest set bit of DIFF.  */
  if ((diff & 0xffffffff00000000ULL) == 0)
    {
      count += 32;
      diff <<= 32;
    }
  if ((diff & 0xffff000000000000ULL) == 0)
    {
      count += 16;
      diff <<= 16;
    }
  if ((diff & 0xff00000000000000ULL) == 0)
    {
      count += 8;
      diff <<= 8;
    }
  if ((diff & 0xf000000000000000ULL) == 0)
    {
      count += 4;
      diff <<= 4;
    }
  if ((diff & 0xc000000000000000ULL) == 0)
    {
      count += 2;
      diff <<= 2;
    }
  if ((diff & 0x8000000000000000ULL) == 0)
    count += 1;

  return count;
}
   12510 
   12511 /* Bit operations.  */
   12512 /* N.B register args may not be SP.  */
   12513 
   12514 /* 32 bit count leading sign bits.  */
   12515 static void
   12516 cls32 (sim_cpu *cpu)
   12517 {
   12518   unsigned rn = INSTR (9, 5);
   12519   unsigned rd = INSTR (4, 0);
   12520 
   12521   /* N.B. the result needs to exclude the leading bit.  */
   12522   aarch64_set_reg_u64
   12523     (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
   12524 }
   12525 
   12526 /* 64 bit count leading sign bits.  */
   12527 static void
   12528 cls64 (sim_cpu *cpu)
   12529 {
   12530   unsigned rn = INSTR (9, 5);
   12531   unsigned rd = INSTR (4, 0);
   12532 
   12533   /* N.B. the result needs to exclude the leading bit.  */
   12534   aarch64_set_reg_u64
   12535     (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
   12536 }
   12537 
   12538 /* 32 bit count leading zero bits.  */
   12539 static void
   12540 clz32 (sim_cpu *cpu)
   12541 {
   12542   unsigned rn = INSTR (9, 5);
   12543   unsigned rd = INSTR (4, 0);
   12544   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   12545 
   12546   /* if the sign (top) bit is set then the count is 0.  */
   12547   if (pick32 (value, 31, 31))
   12548     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
   12549   else
   12550     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
   12551 }
   12552 
   12553 /* 64 bit count leading zero bits.  */
   12554 static void
   12555 clz64 (sim_cpu *cpu)
   12556 {
   12557   unsigned rn = INSTR (9, 5);
   12558   unsigned rd = INSTR (4, 0);
   12559   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   12560 
   12561   /* if the sign (top) bit is set then the count is 0.  */
   12562   if (pick64 (value, 63, 63))
   12563     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
   12564   else
   12565     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
   12566 }
   12567 
   12568 /* 32 bit reverse bits.  */
   12569 static void
   12570 rbit32 (sim_cpu *cpu)
   12571 {
   12572   unsigned rn = INSTR (9, 5);
   12573   unsigned rd = INSTR (4, 0);
   12574   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   12575   uint32_t result = 0;
   12576   int i;
   12577 
   12578   for (i = 0; i < 32; i++)
   12579     {
   12580       result <<= 1;
   12581       result |= (value & 1);
   12582       value >>= 1;
   12583     }
   12584   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   12585 }
   12586 
   12587 /* 64 bit reverse bits.  */
   12588 static void
   12589 rbit64 (sim_cpu *cpu)
   12590 {
   12591   unsigned rn = INSTR (9, 5);
   12592   unsigned rd = INSTR (4, 0);
   12593   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   12594   uint64_t result = 0;
   12595   int i;
   12596 
   12597   for (i = 0; i < 64; i++)
   12598     {
   12599       result <<= 1;
   12600       result |= (value & 1UL);
   12601       value >>= 1;
   12602     }
   12603   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   12604 }
   12605 
   12606 /* 32 bit reverse bytes.  */
   12607 static void
   12608 rev32 (sim_cpu *cpu)
   12609 {
   12610   unsigned rn = INSTR (9, 5);
   12611   unsigned rd = INSTR (4, 0);
   12612   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   12613   uint32_t result = 0;
   12614   int i;
   12615 
   12616   for (i = 0; i < 4; i++)
   12617     {
   12618       result <<= 8;
   12619       result |= (value & 0xff);
   12620       value >>= 8;
   12621     }
   12622   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   12623 }
   12624 
   12625 /* 64 bit reverse bytes.  */
   12626 static void
   12627 rev64 (sim_cpu *cpu)
   12628 {
   12629   unsigned rn = INSTR (9, 5);
   12630   unsigned rd = INSTR (4, 0);
   12631   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   12632   uint64_t result = 0;
   12633   int i;
   12634 
   12635   for (i = 0; i < 8; i++)
   12636     {
   12637       result <<= 8;
   12638       result |= (value & 0xffULL);
   12639       value >>= 8;
   12640     }
   12641   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   12642 }
   12643 
   12644 /* 32 bit reverse shorts.  */
   12645 /* N.B.this reverses the order of the bytes in each half word.  */
   12646 static void
   12647 revh32 (sim_cpu *cpu)
   12648 {
   12649   unsigned rn = INSTR (9, 5);
   12650   unsigned rd = INSTR (4, 0);
   12651   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   12652   uint32_t result = 0;
   12653   int i;
   12654 
   12655   for (i = 0; i < 2; i++)
   12656     {
   12657       result <<= 8;
   12658       result |= (value & 0x00ff00ff);
   12659       value >>= 8;
   12660     }
   12661   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   12662 }
   12663 
   12664 /* 64 bit reverse shorts.  */
   12665 /* N.B.this reverses the order of the bytes in each half word.  */
   12666 static void
   12667 revh64 (sim_cpu *cpu)
   12668 {
   12669   unsigned rn = INSTR (9, 5);
   12670   unsigned rd = INSTR (4, 0);
   12671   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   12672   uint64_t result = 0;
   12673   int i;
   12674 
   12675   for (i = 0; i < 2; i++)
   12676     {
   12677       result <<= 8;
   12678       result |= (value & 0x00ff00ff00ff00ffULL);
   12679       value >>= 8;
   12680     }
   12681   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
   12682 }
   12683 
   12684 static void
   12685 dexDataProc1Source (sim_cpu *cpu)
   12686 {
   12687   /* instr[30]    = 1
   12688      instr[28,21] = 111010110
   12689      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
   12690      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
   12691      instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
   12692      instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
   12693                              000010 ==> REV, 000011 ==> UNALLOC
   12694                              000100 ==> CLZ, 000101 ==> CLS
   12695                              ow ==> UNALLOC
   12696      instr[9,5]   = rn : may not be SP
   12697      instr[4,0]   = rd : may not be SP.  */
   12698 
   12699   uint32_t S = INSTR (29, 29);
   12700   uint32_t opcode2 = INSTR (20, 16);
   12701   uint32_t opcode = INSTR (15, 10);
   12702   uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
   12703 
   12704   if (S == 1)
   12705     HALT_UNALLOC;
   12706 
   12707   if (opcode2 != 0)
   12708     HALT_UNALLOC;
   12709 
   12710   if (opcode & 0x38)
   12711     HALT_UNALLOC;
   12712 
   12713   switch (dispatch)
   12714     {
   12715     case 0: rbit32 (cpu); return;
   12716     case 1: revh32 (cpu); return;
   12717     case 2: rev32 (cpu); return;
   12718     case 4: clz32 (cpu); return;
   12719     case 5: cls32 (cpu); return;
   12720     case 8: rbit64 (cpu); return;
   12721     case 9: revh64 (cpu); return;
   12722     case 10:rev32 (cpu); return;
   12723     case 11:rev64 (cpu); return;
   12724     case 12:clz64 (cpu); return;
   12725     case 13:cls64 (cpu); return;
   12726     default: HALT_UNALLOC;
   12727     }
   12728 }
   12729 
   12730 /* Variable shift.
   12731    Shifts by count supplied in register.
   12732    N.B register args may not be SP.
   12733    These all use the shifted auxiliary function for
   12734    simplicity and clarity.  Writing the actual shift
   12735    inline would avoid a branch and so be faster but
   12736    would also necessitate getting signs right.  */
   12737 
   12738 /* 32 bit arithmetic shift right.  */
   12739 static void
   12740 asrv32 (sim_cpu *cpu)
   12741 {
   12742   unsigned rm = INSTR (20, 16);
   12743   unsigned rn = INSTR (9, 5);
   12744   unsigned rd = INSTR (4, 0);
   12745 
   12746   aarch64_set_reg_u64
   12747     (cpu, rd, NO_SP,
   12748      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
   12749 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
   12750 }
   12751 
   12752 /* 64 bit arithmetic shift right.  */
   12753 static void
   12754 asrv64 (sim_cpu *cpu)
   12755 {
   12756   unsigned rm = INSTR (20, 16);
   12757   unsigned rn = INSTR (9, 5);
   12758   unsigned rd = INSTR (4, 0);
   12759 
   12760   aarch64_set_reg_u64
   12761     (cpu, rd, NO_SP,
   12762      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
   12763 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
   12764 }
   12765 
   12766 /* 32 bit logical shift left.  */
   12767 static void
   12768 lslv32 (sim_cpu *cpu)
   12769 {
   12770   unsigned rm = INSTR (20, 16);
   12771   unsigned rn = INSTR (9, 5);
   12772   unsigned rd = INSTR (4, 0);
   12773 
   12774   aarch64_set_reg_u64
   12775     (cpu, rd, NO_SP,
   12776      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
   12777 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
   12778 }
   12779 
   12780 /* 64 bit arithmetic shift left.  */
   12781 static void
   12782 lslv64 (sim_cpu *cpu)
   12783 {
   12784   unsigned rm = INSTR (20, 16);
   12785   unsigned rn = INSTR (9, 5);
   12786   unsigned rd = INSTR (4, 0);
   12787 
   12788   aarch64_set_reg_u64
   12789     (cpu, rd, NO_SP,
   12790      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
   12791 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
   12792 }
   12793 
   12794 /* 32 bit logical shift right.  */
   12795 static void
   12796 lsrv32 (sim_cpu *cpu)
   12797 {
   12798   unsigned rm = INSTR (20, 16);
   12799   unsigned rn = INSTR (9, 5);
   12800   unsigned rd = INSTR (4, 0);
   12801 
   12802   aarch64_set_reg_u64
   12803     (cpu, rd, NO_SP,
   12804      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
   12805 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
   12806 }
   12807 
   12808 /* 64 bit logical shift right.  */
   12809 static void
   12810 lsrv64 (sim_cpu *cpu)
   12811 {
   12812   unsigned rm = INSTR (20, 16);
   12813   unsigned rn = INSTR (9, 5);
   12814   unsigned rd = INSTR (4, 0);
   12815 
   12816   aarch64_set_reg_u64
   12817     (cpu, rd, NO_SP,
   12818      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
   12819 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
   12820 }
   12821 
   12822 /* 32 bit rotate right.  */
   12823 static void
   12824 rorv32 (sim_cpu *cpu)
   12825 {
   12826   unsigned rm = INSTR (20, 16);
   12827   unsigned rn = INSTR (9, 5);
   12828   unsigned rd = INSTR (4, 0);
   12829 
   12830   aarch64_set_reg_u64
   12831     (cpu, rd, NO_SP,
   12832      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
   12833 		(aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
   12834 }
   12835 
   12836 /* 64 bit rotate right.  */
   12837 static void
   12838 rorv64 (sim_cpu *cpu)
   12839 {
   12840   unsigned rm = INSTR (20, 16);
   12841   unsigned rn = INSTR (9, 5);
   12842   unsigned rd = INSTR (4, 0);
   12843 
   12844   aarch64_set_reg_u64
   12845     (cpu, rd, NO_SP,
   12846      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
   12847 		(aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
   12848 }
   12849 
   12850 
   12851 /* divide.  */
   12852 
   12853 /* 32 bit signed divide.  */
   12854 static void
   12855 cpuiv32 (sim_cpu *cpu)
   12856 {
   12857   unsigned rm = INSTR (20, 16);
   12858   unsigned rn = INSTR (9, 5);
   12859   unsigned rd = INSTR (4, 0);
   12860   /* N.B. the pseudo-code does the divide using 64 bit data.  */
   12861   /* TODO : check that this rounds towards zero as required.  */
   12862   int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
   12863   int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
   12864 
   12865   aarch64_set_reg_s64 (cpu, rd, NO_SP,
   12866 		       divisor ? ((int32_t) (dividend / divisor)) : 0);
   12867 }
   12868 
   12869 /* 64 bit signed divide.  */
   12870 static void
   12871 cpuiv64 (sim_cpu *cpu)
   12872 {
   12873   unsigned rm = INSTR (20, 16);
   12874   unsigned rn = INSTR (9, 5);
   12875   unsigned rd = INSTR (4, 0);
   12876 
   12877   /* TODO : check that this rounds towards zero as required.  */
   12878   int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
   12879 
   12880   aarch64_set_reg_s64
   12881     (cpu, rd, NO_SP,
   12882      divisor ? (aarch64_get_reg_s64 (cpu, rn, NO_SP) / divisor) : 0);
   12883 }
   12884 
   12885 /* 32 bit unsigned divide.  */
   12886 static void
   12887 udiv32 (sim_cpu *cpu)
   12888 {
   12889   unsigned rm = INSTR (20, 16);
   12890   unsigned rn = INSTR (9, 5);
   12891   unsigned rd = INSTR (4, 0);
   12892 
   12893   /* N.B. the pseudo-code does the divide using 64 bit data.  */
   12894   uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   12895   uint64_t divisor  = aarch64_get_reg_u32 (cpu, rm, NO_SP);
   12896 
   12897   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12898 		       divisor ? (uint32_t) (dividend / divisor) : 0);
   12899 }
   12900 
   12901 /* 64 bit unsigned divide.  */
   12902 static void
   12903 udiv64 (sim_cpu *cpu)
   12904 {
   12905   unsigned rm = INSTR (20, 16);
   12906   unsigned rn = INSTR (9, 5);
   12907   unsigned rd = INSTR (4, 0);
   12908 
   12909   /* TODO : check that this rounds towards zero as required.  */
   12910   uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
   12911 
   12912   aarch64_set_reg_u64
   12913     (cpu, rd, NO_SP,
   12914      divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
   12915 }
   12916 
   12917 static void
   12918 dexDataProc2Source (sim_cpu *cpu)
   12919 {
   12920   /* assert instr[30] == 0
   12921      instr[28,21] == 11010110
   12922      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
   12923      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
   12924      instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> CPUIV,
   12925                              001000 ==> LSLV, 001001 ==> LSRV
   12926                              001010 ==> ASRV, 001011 ==> RORV
   12927                              ow ==> UNALLOC.  */
   12928 
   12929   uint32_t dispatch;
   12930   uint32_t S = INSTR (29, 29);
   12931   uint32_t opcode = INSTR (15, 10);
   12932 
   12933   if (S == 1)
   12934     HALT_UNALLOC;
   12935 
   12936   if (opcode & 0x34)
   12937     HALT_UNALLOC;
   12938 
   12939   dispatch = (  (INSTR (31, 31) << 3)
   12940 	      | (uimm (opcode, 3, 3) << 2)
   12941 	      |  uimm (opcode, 1, 0));
   12942   switch (dispatch)
   12943     {
   12944     case 2:  udiv32 (cpu); return;
   12945     case 3:  cpuiv32 (cpu); return;
   12946     case 4:  lslv32 (cpu); return;
   12947     case 5:  lsrv32 (cpu); return;
   12948     case 6:  asrv32 (cpu); return;
   12949     case 7:  rorv32 (cpu); return;
   12950     case 10: udiv64 (cpu); return;
   12951     case 11: cpuiv64 (cpu); return;
   12952     case 12: lslv64 (cpu); return;
   12953     case 13: lsrv64 (cpu); return;
   12954     case 14: asrv64 (cpu); return;
   12955     case 15: rorv64 (cpu); return;
   12956     default: HALT_UNALLOC;
   12957     }
   12958 }
   12959 
   12960 
   12961 /* Multiply.  */
   12962 
   12963 /* 32 bit multiply and add.  */
   12964 static void
   12965 madd32 (sim_cpu *cpu)
   12966 {
   12967   unsigned rm = INSTR (20, 16);
   12968   unsigned ra = INSTR (14, 10);
   12969   unsigned rn = INSTR (9, 5);
   12970   unsigned rd = INSTR (4, 0);
   12971 
   12972   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   12973   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12974 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
   12975 		       + aarch64_get_reg_u32 (cpu, rn, NO_SP)
   12976 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
   12977 }
   12978 
   12979 /* 64 bit multiply and add.  */
   12980 static void
   12981 madd64 (sim_cpu *cpu)
   12982 {
   12983   unsigned rm = INSTR (20, 16);
   12984   unsigned ra = INSTR (14, 10);
   12985   unsigned rn = INSTR (9, 5);
   12986   unsigned rd = INSTR (4, 0);
   12987 
   12988   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   12989   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   12990 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
   12991 		       + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
   12992 			  * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
   12993 }
   12994 
   12995 /* 32 bit multiply and sub.  */
   12996 static void
   12997 msub32 (sim_cpu *cpu)
   12998 {
   12999   unsigned rm = INSTR (20, 16);
   13000   unsigned ra = INSTR (14, 10);
   13001   unsigned rn = INSTR (9, 5);
   13002   unsigned rd = INSTR (4, 0);
   13003 
   13004   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13005   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   13006 		       aarch64_get_reg_u32 (cpu, ra, NO_SP)
   13007 		       - aarch64_get_reg_u32 (cpu, rn, NO_SP)
   13008 		       * aarch64_get_reg_u32 (cpu, rm, NO_SP));
   13009 }
   13010 
   13011 /* 64 bit multiply and sub.  */
   13012 static void
   13013 msub64 (sim_cpu *cpu)
   13014 {
   13015   unsigned rm = INSTR (20, 16);
   13016   unsigned ra = INSTR (14, 10);
   13017   unsigned rn = INSTR (9, 5);
   13018   unsigned rd = INSTR (4, 0);
   13019 
   13020   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13021   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   13022 		       aarch64_get_reg_u64 (cpu, ra, NO_SP)
   13023 		       - aarch64_get_reg_u64 (cpu, rn, NO_SP)
   13024 		       * aarch64_get_reg_u64 (cpu, rm, NO_SP));
   13025 }
   13026 
   13027 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit.  */
   13028 static void
   13029 smaddl (sim_cpu *cpu)
   13030 {
   13031   unsigned rm = INSTR (20, 16);
   13032   unsigned ra = INSTR (14, 10);
   13033   unsigned rn = INSTR (9, 5);
   13034   unsigned rd = INSTR (4, 0);
   13035 
   13036   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
   13037      obtain a 64 bit product.  */
   13038   aarch64_set_reg_s64
   13039     (cpu, rd, NO_SP,
   13040      aarch64_get_reg_s64 (cpu, ra, NO_SP)
   13041      + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
   13042      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
   13043 }
   13044 
   13045 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
   13046 static void
   13047 smsubl (sim_cpu *cpu)
   13048 {
   13049   unsigned rm = INSTR (20, 16);
   13050   unsigned ra = INSTR (14, 10);
   13051   unsigned rn = INSTR (9, 5);
   13052   unsigned rd = INSTR (4, 0);
   13053 
   13054   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
   13055      obtain a 64 bit product.  */
   13056   aarch64_set_reg_s64
   13057     (cpu, rd, NO_SP,
   13058      aarch64_get_reg_s64 (cpu, ra, NO_SP)
   13059      - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
   13060      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
   13061 }
   13062 
   13063 /* Integer Multiply/Divide.  */
   13064 
   13065 /* First some macros and a helper function.  */
   13066 /* Macros to test or access elements of 64 bit words.  */
   13067 
/* Mask used to access lo 32 bits of 64 bit unsigned int.  */
#define LOW_WORD_MASK ((1ULL << 32) - 1)
/* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
/* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define highWordToU64(_value_u64) ((_value_u64) >> 32)

/* Offset of sign bit in 64 bit signed integer.  */
#define SIGN_SHIFT_U64 63
/* The sign bit itself -- also identifies the minimum negative int value.
   N.B. the constant must be unsigned long long: plain unsigned long is
   only 32 bits wide on some hosts.  */
#define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
/* Return true if a 64 bit signed int presented as an unsigned int is the
   most negative value.  */
#define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
/* Return true (non-zero) if a 64 bit signed int presented as an unsigned
   int has its sign bit set.  */
#define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
/* Return -1 or 1 according to whether a 64 bit signed int presented as
   an unsigned int has its sign bit set or clear.  */
#define signOfU64(_value_u64) \
  (1LL - 2LL * (long long) ((_value_u64) >> SIGN_SHIFT_U64))
/* Clear the sign bit of a 64 bit signed int presented as an unsigned int.
   N.B. this assigns to its argument, which must be an lvalue.  */
#define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
   13090 
/* Multiply two 64 bit unsigned ints and return the hi 64 bits of the
   128 bit product.

   Each operand is split into 32 bit halves and the four partial
   products are summed, tracking the carries out of the middle
   column by hand.  */

static uint64_t
mul64hi (uint64_t value1, uint64_t value2)
{
  const uint64_t half_mask = (1ULL << 32) - 1;
  const uint64_t a_lo = value1 & half_mask;
  const uint64_t a_hi = value1 >> 32;
  const uint64_t b_lo = value2 & half_mask;
  const uint64_t b_hi = value2 >> 32;

  /* The four 32 x 32 -> 64 bit partial products.  */
  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t hi_hi = a_hi * b_hi;

  uint64_t carries = 0;
  uint64_t middle;

  /* Only the top half of the lowest product reaches the middle
     column.  */
  middle = (lo_lo >> 32) + lo_hi;
  if (middle < lo_hi)
    carries++;

  middle += hi_lo;
  if (middle < hi_lo)
    carries++;

  /* The top half of the middle column lands in the low half of the
     result; each carry out of it is worth 2^32 there.  */
  return hi_hi + (middle >> 32) + (carries << 32);
}
   13138 
   13139 /* Signed multiply high, source, source2 :
   13140    64 bit, dest <-- high 64-bit of result.  */
   13141 static void
   13142 smulh (sim_cpu *cpu)
   13143 {
   13144   uint64_t uresult;
   13145   int64_t  result;
   13146   unsigned rm = INSTR (20, 16);
   13147   unsigned rn = INSTR (9, 5);
   13148   unsigned rd = INSTR (4, 0);
   13149   GReg     ra = INSTR (14, 10);
   13150   int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   13151   int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
   13152   uint64_t uvalue1;
   13153   uint64_t uvalue2;
   13154   int  negate = 0;
   13155 
   13156   if (ra != R31)
   13157     HALT_UNALLOC;
   13158 
   13159   /* Convert to unsigned and use the unsigned mul64hi routine
   13160      the fix the sign up afterwards.  */
   13161   if (value1 < 0)
   13162     {
   13163       negate = !negate;
   13164       uvalue1 = -value1;
   13165     }
   13166   else
   13167     {
   13168       uvalue1 = value1;
   13169     }
   13170 
   13171   if (value2 < 0)
   13172     {
   13173       negate = !negate;
   13174       uvalue2 = -value2;
   13175     }
   13176   else
   13177     {
   13178       uvalue2 = value2;
   13179     }
   13180 
   13181   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13182 
   13183   uresult = mul64hi (uvalue1, uvalue2);
   13184   result = uresult;
   13185 
   13186   if (negate)
   13187     {
   13188       /* Multiply 128-bit result by -1, which means highpart gets inverted,
   13189 	 and has carry in added only if low part is 0.  */
   13190       result = ~result;
   13191       if ((uvalue1 * uvalue2) == 0)
   13192 	result += 1;
   13193     }
   13194 
   13195   aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
   13196 }
   13197 
   13198 /* Unsigned multiply add long -- source, source2 :
   13199    32 bit, source3 : 64 bit.  */
   13200 static void
   13201 umaddl (sim_cpu *cpu)
   13202 {
   13203   unsigned rm = INSTR (20, 16);
   13204   unsigned ra = INSTR (14, 10);
   13205   unsigned rn = INSTR (9, 5);
   13206   unsigned rd = INSTR (4, 0);
   13207 
   13208   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13209   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
   13210      obtain a 64 bit product.  */
   13211   aarch64_set_reg_u64
   13212     (cpu, rd, NO_SP,
   13213      aarch64_get_reg_u64 (cpu, ra, NO_SP)
   13214      + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
   13215      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
   13216 }
   13217 
   13218 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
   13219 static void
   13220 umsubl (sim_cpu *cpu)
   13221 {
   13222   unsigned rm = INSTR (20, 16);
   13223   unsigned ra = INSTR (14, 10);
   13224   unsigned rn = INSTR (9, 5);
   13225   unsigned rd = INSTR (4, 0);
   13226 
   13227   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13228   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
   13229      obtain a 64 bit product.  */
   13230   aarch64_set_reg_u64
   13231     (cpu, rd, NO_SP,
   13232      aarch64_get_reg_u64 (cpu, ra, NO_SP)
   13233      - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
   13234      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
   13235 }
   13236 
   13237 /* Unsigned multiply high, source, source2 :
   13238    64 bit, dest <-- high 64-bit of result.  */
   13239 static void
   13240 umulh (sim_cpu *cpu)
   13241 {
   13242   unsigned rm = INSTR (20, 16);
   13243   unsigned rn = INSTR (9, 5);
   13244   unsigned rd = INSTR (4, 0);
   13245   GReg     ra = INSTR (14, 10);
   13246 
   13247   if (ra != R31)
   13248     HALT_UNALLOC;
   13249 
   13250   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13251   aarch64_set_reg_u64 (cpu, rd, NO_SP,
   13252 		       mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
   13253 				aarch64_get_reg_u64 (cpu, rm, NO_SP)));
   13254 }
   13255 
   13256 static void
   13257 dexDataProc3Source (sim_cpu *cpu)
   13258 {
   13259   /* assert instr[28,24] == 11011.  */
   13260   /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
   13261      instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
   13262      instr[23,21] = op31 : 111 ==> UNALLOC, o2 ==> ok
   13263      instr[15] = o0 : 0/1 ==> ok
   13264      instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB,     (32/64 bit)
   13265                               0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
   13266                               0100 ==> SMULH,                   (64 bit only)
   13267                               1010 ==> UMADDL, 1011 ==> UNSUBL, (64 bit only)
   13268                               1100 ==> UMULH                    (64 bit only)
   13269                               ow ==> UNALLOC.  */
   13270 
   13271   uint32_t dispatch;
   13272   uint32_t size = INSTR (31, 31);
   13273   uint32_t op54 = INSTR (30, 29);
   13274   uint32_t op31 = INSTR (23, 21);
   13275   uint32_t o0 = INSTR (15, 15);
   13276 
   13277   if (op54 != 0)
   13278     HALT_UNALLOC;
   13279 
   13280   if (size == 0)
   13281     {
   13282       if (op31 != 0)
   13283 	HALT_UNALLOC;
   13284 
   13285       if (o0 == 0)
   13286 	madd32 (cpu);
   13287       else
   13288 	msub32 (cpu);
   13289       return;
   13290     }
   13291 
   13292   dispatch = (op31 << 1) | o0;
   13293 
   13294   switch (dispatch)
   13295     {
   13296     case 0:  madd64 (cpu); return;
   13297     case 1:  msub64 (cpu); return;
   13298     case 2:  smaddl (cpu); return;
   13299     case 3:  smsubl (cpu); return;
   13300     case 4:  smulh (cpu); return;
   13301     case 10: umaddl (cpu); return;
   13302     case 11: umsubl (cpu); return;
   13303     case 12: umulh (cpu); return;
   13304     default: HALT_UNALLOC;
   13305     }
   13306 }
   13307 
   13308 static void
   13309 dexDPReg (sim_cpu *cpu)
   13310 {
   13311   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
   13312      assert  group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
   13313      bits [28:24:21] of a DPReg are the secondary dispatch vector.  */
   13314   uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
   13315 
   13316   switch (group2)
   13317     {
   13318     case DPREG_LOG_000:
   13319     case DPREG_LOG_001:
   13320       dexLogicalShiftedRegister (cpu); return;
   13321 
   13322     case DPREG_ADDSHF_010:
   13323       dexAddSubtractShiftedRegister (cpu); return;
   13324 
   13325     case DPREG_ADDEXT_011:
   13326       dexAddSubtractExtendedRegister (cpu); return;
   13327 
   13328     case DPREG_ADDCOND_100:
   13329       {
   13330 	/* This set bundles a variety of different operations.  */
   13331 	/* Check for.  */
   13332 	/* 1) add/sub w carry.  */
   13333 	uint32_t mask1 = 0x1FE00000U;
   13334 	uint32_t val1  = 0x1A000000U;
   13335 	/* 2) cond compare register/immediate.  */
   13336 	uint32_t mask2 = 0x1FE00000U;
   13337 	uint32_t val2  = 0x1A400000U;
   13338 	/* 3) cond select.  */
   13339 	uint32_t mask3 = 0x1FE00000U;
   13340 	uint32_t val3  = 0x1A800000U;
   13341 	/* 4) data proc 1/2 source.  */
   13342 	uint32_t mask4 = 0x1FE00000U;
   13343 	uint32_t val4  = 0x1AC00000U;
   13344 
   13345 	if ((aarch64_get_instr (cpu) & mask1) == val1)
   13346 	  dexAddSubtractWithCarry (cpu);
   13347 
   13348 	else if ((aarch64_get_instr (cpu) & mask2) == val2)
   13349 	  CondCompare (cpu);
   13350 
   13351 	else if ((aarch64_get_instr (cpu) & mask3) == val3)
   13352 	  dexCondSelect (cpu);
   13353 
   13354 	else if ((aarch64_get_instr (cpu) & mask4) == val4)
   13355 	  {
   13356 	    /* Bit 30 is clear for data proc 2 source
   13357 	       and set for data proc 1 source.  */
   13358 	    if (aarch64_get_instr (cpu)  & (1U << 30))
   13359 	      dexDataProc1Source (cpu);
   13360 	    else
   13361 	      dexDataProc2Source (cpu);
   13362 	  }
   13363 
   13364 	else
   13365 	  /* Should not reach here.  */
   13366 	  HALT_NYI;
   13367 
   13368 	return;
   13369       }
   13370 
   13371     case DPREG_3SRC_110:
   13372       dexDataProc3Source (cpu); return;
   13373 
   13374     case DPREG_UNALLOC_101:
   13375       HALT_UNALLOC;
   13376 
   13377     case DPREG_3SRC_111:
   13378       dexDataProc3Source (cpu); return;
   13379 
   13380     default:
   13381       /* Should never reach here.  */
   13382       HALT_NYI;
   13383     }
   13384 }
   13385 
   13386 /* Unconditional Branch immediate.
   13387    Offset is a PC-relative byte offset in the range +/- 128MiB.
   13388    The offset is assumed to be raw from the decode i.e. the
   13389    simulator is expected to scale them from word offsets to byte.  */
   13390 
   13391 /* Unconditional branch.  */
   13392 static void
   13393 buc (sim_cpu *cpu, int32_t offset)
   13394 {
   13395   aarch64_set_next_PC_by_offset (cpu, offset);
   13396 }
   13397 
   13398 static unsigned stack_depth = 0;
   13399 
   13400 /* Unconditional branch and link -- writes return PC to LR.  */
   13401 static void
   13402 bl (sim_cpu *cpu, int32_t offset)
   13403 {
   13404   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13405   aarch64_save_LR (cpu);
   13406   aarch64_set_next_PC_by_offset (cpu, offset);
   13407 
   13408   if (TRACE_BRANCH_P (cpu))
   13409     {
   13410       ++ stack_depth;
   13411       TRACE_BRANCH (cpu,
   13412 		    " %*scall %" PRIx64 " [%s]"
   13413 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
   13414 		    stack_depth, " ", aarch64_get_next_PC (cpu),
   13415 		    aarch64_get_func (CPU_STATE (cpu),
   13416 				      aarch64_get_next_PC (cpu)),
   13417 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
   13418 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
   13419 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
   13420 		    );
   13421     }
   13422 }
   13423 
   13424 /* Unconditional Branch register.
   13425    Branch/return address is in source register.  */
   13426 
   13427 /* Unconditional branch.  */
   13428 static void
   13429 br (sim_cpu *cpu)
   13430 {
   13431   unsigned rn = INSTR (9, 5);
   13432   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13433   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
   13434 }
   13435 
   13436 /* Unconditional branch and link -- writes return PC to LR.  */
   13437 static void
   13438 blr (sim_cpu *cpu)
   13439 {
   13440   /* Ensure we read the destination before we write LR.  */
   13441   uint64_t target = aarch64_get_reg_u64 (cpu, INSTR (9, 5), NO_SP);
   13442 
   13443   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13444   aarch64_save_LR (cpu);
   13445   aarch64_set_next_PC (cpu, target);
   13446 
   13447   if (TRACE_BRANCH_P (cpu))
   13448     {
   13449       ++ stack_depth;
   13450       TRACE_BRANCH (cpu,
   13451 		    " %*scall %" PRIx64 " [%s]"
   13452 		    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
   13453 		    stack_depth, " ", aarch64_get_next_PC (cpu),
   13454 		    aarch64_get_func (CPU_STATE (cpu),
   13455 				      aarch64_get_next_PC (cpu)),
   13456 		    aarch64_get_reg_u64 (cpu, 0, NO_SP),
   13457 		    aarch64_get_reg_u64 (cpu, 1, NO_SP),
   13458 		    aarch64_get_reg_u64 (cpu, 2, NO_SP)
   13459 		    );
   13460     }
   13461 }
   13462 
   13463 /* Return -- assembler will default source to LR this is functionally
   13464    equivalent to br but, presumably, unlike br it side effects the
   13465    branch predictor.  */
   13466 static void
   13467 ret (sim_cpu *cpu)
   13468 {
   13469   unsigned rn = INSTR (9, 5);
   13470   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
   13471 
   13472   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13473   if (TRACE_BRANCH_P (cpu))
   13474     {
   13475       TRACE_BRANCH (cpu,
   13476 		    " %*sreturn [result: %" PRIx64 "]",
   13477 		    stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
   13478       -- stack_depth;
   13479     }
   13480 }
   13481 
   13482 /* NOP -- we implement this and call it from the decode in case we
   13483    want to intercept it later.  */
   13484 
   13485 static void
   13486 nop (sim_cpu *cpu)
   13487 {
   13488   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13489 }
   13490 
   13491 /* Data synchronization barrier.  */
   13492 
   13493 static void
   13494 dsb (sim_cpu *cpu)
   13495 {
   13496   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13497 }
   13498 
   13499 /* Data memory barrier.  */
   13500 
   13501 static void
   13502 dmb (sim_cpu *cpu)
   13503 {
   13504   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13505 }
   13506 
   13507 /* Instruction synchronization barrier.  */
   13508 
   13509 static void
   13510 isb (sim_cpu *cpu)
   13511 {
   13512   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13513 }
   13514 
   13515 static void
   13516 dexBranchImmediate (sim_cpu *cpu)
   13517 {
   13518   /* assert instr[30,26] == 00101
   13519      instr[31] ==> 0 == B, 1 == BL
   13520      instr[25,0] == imm26 branch offset counted in words.  */
   13521 
   13522   uint32_t top = INSTR (31, 31);
   13523   /* We have a 26 byte signed word offset which we need to pass to the
   13524      execute routine as a signed byte offset.  */
   13525   int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
   13526 
   13527   if (top)
   13528     bl (cpu, offset);
   13529   else
   13530     buc (cpu, offset);
   13531 }
   13532 
   13533 /* Control Flow.  */
   13534 
   13535 /* Conditional branch
   13536 
   13537    Offset is a PC-relative byte offset in the range +/- 1MiB pos is
   13538    a bit position in the range 0 .. 63
   13539 
   13540    cc is a CondCode enum value as pulled out of the decode
   13541 
   13542    N.B. any offset register (source) can only be Xn or Wn.  */
   13543 
   13544 static void
   13545 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
   13546 {
   13547   /* The test returns TRUE if CC is met.  */
   13548   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13549   if (testConditionCode (cpu, cc))
   13550     aarch64_set_next_PC_by_offset (cpu, offset);
   13551 }
   13552 
   13553 /* 32 bit branch on register non-zero.  */
   13554 static void
   13555 cbnz32 (sim_cpu *cpu, int32_t offset)
   13556 {
   13557   unsigned rt = INSTR (4, 0);
   13558 
   13559   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13560   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
   13561     aarch64_set_next_PC_by_offset (cpu, offset);
   13562 }
   13563 
   13564 /* 64 bit branch on register zero.  */
   13565 static void
   13566 cbnz (sim_cpu *cpu, int32_t offset)
   13567 {
   13568   unsigned rt = INSTR (4, 0);
   13569 
   13570   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13571   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
   13572     aarch64_set_next_PC_by_offset (cpu, offset);
   13573 }
   13574 
   13575 /* 32 bit branch on register non-zero.  */
   13576 static void
   13577 cbz32 (sim_cpu *cpu, int32_t offset)
   13578 {
   13579   unsigned rt = INSTR (4, 0);
   13580 
   13581   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13582   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
   13583     aarch64_set_next_PC_by_offset (cpu, offset);
   13584 }
   13585 
   13586 /* 64 bit branch on register zero.  */
   13587 static void
   13588 cbz (sim_cpu *cpu, int32_t offset)
   13589 {
   13590   unsigned rt = INSTR (4, 0);
   13591 
   13592   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13593   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
   13594     aarch64_set_next_PC_by_offset (cpu, offset);
   13595 }
   13596 
   13597 /* Branch on register bit test non-zero -- one size fits all.  */
   13598 static void
   13599 tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
   13600 {
   13601   unsigned rt = INSTR (4, 0);
   13602 
   13603   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13604   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
   13605     aarch64_set_next_PC_by_offset (cpu, offset);
   13606 }
   13607 
   13608 /* Branch on register bit test zero -- one size fits all.  */
   13609 static void
   13610 tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
   13611 {
   13612   unsigned rt = INSTR (4, 0);
   13613 
   13614   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13615   if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
   13616     aarch64_set_next_PC_by_offset (cpu, offset);
   13617 }
   13618 
   13619 static void
   13620 dexCompareBranchImmediate (sim_cpu *cpu)
   13621 {
   13622   /* instr[30,25] = 01 1010
   13623      instr[31]    = size : 0 ==> 32, 1 ==> 64
   13624      instr[24]    = op : 0 ==> CBZ, 1 ==> CBNZ
   13625      instr[23,5]  = simm19 branch offset counted in words
   13626      instr[4,0]   = rt  */
   13627 
   13628   uint32_t size = INSTR (31, 31);
   13629   uint32_t op   = INSTR (24, 24);
   13630   int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
   13631 
   13632   if (size == 0)
   13633     {
   13634       if (op == 0)
   13635 	cbz32 (cpu, offset);
   13636       else
   13637 	cbnz32 (cpu, offset);
   13638     }
   13639   else
   13640     {
   13641       if (op == 0)
   13642 	cbz (cpu, offset);
   13643       else
   13644 	cbnz (cpu, offset);
   13645     }
   13646 }
   13647 
   13648 static void
   13649 dexTestBranchImmediate (sim_cpu *cpu)
   13650 {
   13651   /* instr[31]    = b5 : bit 5 of test bit idx
   13652      instr[30,25] = 01 1011
   13653      instr[24]    = op : 0 ==> TBZ, 1 == TBNZ
   13654      instr[23,19] = b40 : bits 4 to 0 of test bit idx
   13655      instr[18,5]  = simm14 : signed offset counted in words
   13656      instr[4,0]   = uimm5  */
   13657 
   13658   uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
   13659   int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
   13660 
   13661   NYI_assert (30, 25, 0x1b);
   13662 
   13663   if (INSTR (24, 24) == 0)
   13664     tbz (cpu, pos, offset);
   13665   else
   13666     tbnz (cpu, pos, offset);
   13667 }
   13668 
   13669 static void
   13670 dexCondBranchImmediate (sim_cpu *cpu)
   13671 {
   13672   /* instr[31,25] = 010 1010
   13673      instr[24]    = op1; op => 00 ==> B.cond
   13674      instr[23,5]  = simm19 : signed offset counted in words
   13675      instr[4]     = op0
   13676      instr[3,0]   = cond  */
   13677 
   13678   int32_t offset;
   13679   uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
   13680 
   13681   NYI_assert (31, 25, 0x2a);
   13682 
   13683   if (op != 0)
   13684     HALT_UNALLOC;
   13685 
   13686   offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
   13687 
   13688   bcc (cpu, offset, INSTR (3, 0));
   13689 }
   13690 
   13691 static void
   13692 dexBranchRegister (sim_cpu *cpu)
   13693 {
   13694   /* instr[31,25] = 110 1011
   13695      instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 3 => ERET, 4 => DRPS
   13696      instr[20,16] = op2 : must be 11111
   13697      instr[15,10] = op3 : must be 000000
   13698      instr[4,0]   = op2 : must be 11111.  */
   13699 
   13700   uint32_t op = INSTR (24, 21);
   13701   uint32_t op2 = INSTR (20, 16);
   13702   uint32_t op3 = INSTR (15, 10);
   13703   uint32_t op4 = INSTR (4, 0);
   13704 
   13705   NYI_assert (31, 25, 0x6b);
   13706 
   13707   if (op2 != 0x1F || op3 != 0 || op4 != 0)
   13708     HALT_UNALLOC;
   13709 
   13710   if (op == 0)
   13711     br (cpu);
   13712 
   13713   else if (op == 1)
   13714     blr (cpu);
   13715 
   13716   else if (op == 2)
   13717     ret (cpu);
   13718 
   13719   else
   13720     {
   13721       /* ERET and DRPS accept 0b11111 for rn = instr [4,0].  */
   13722       /* anything else is unallocated.  */
   13723       uint32_t rn = INSTR (4, 0);
   13724 
   13725       if (rn != 0x1f)
   13726 	HALT_UNALLOC;
   13727 
   13728       if (op == 4 || op == 5)
   13729 	HALT_NYI;
   13730 
   13731       HALT_UNALLOC;
   13732     }
   13733 }
   13734 
/* Angel SVC reason codes.

   FIXME: These values ought to come from
   ../../libgloss/aarch64/svc.h, but that header may not be
   available, so the ones we need are defined here instead.  */

/* File operations.  */
#define AngelSVC_Reason_Open            0x01
#define AngelSVC_Reason_Close           0x02
#define AngelSVC_Reason_Write           0x05
#define AngelSVC_Reason_Read            0x06
#define AngelSVC_Reason_IsTTY           0x09
#define AngelSVC_Reason_Seek            0x0A
#define AngelSVC_Reason_FLen            0x0C
#define AngelSVC_Reason_Remove          0x0E
#define AngelSVC_Reason_Rename          0x0F

/* Time keeping.  */
#define AngelSVC_Reason_Clock           0x10
#define AngelSVC_Reason_Time            0x11
#define AngelSVC_Reason_Elapsed         0x30

/* Environment and program control.  */
#define AngelSVC_Reason_System          0x12
#define AngelSVC_Reason_Errno           0x13
#define AngelSVC_Reason_GetCmdLine      0x15
#define AngelSVC_Reason_HeapInfo        0x16
#define AngelSVC_Reason_ReportException 0x18

   13755 
   13756 
   13757 static void
   13758 handle_halt (sim_cpu *cpu, uint32_t val)
   13759 {
   13760   uint64_t result = 0;
   13761 
   13762   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   13763   if (val != 0xf000)
   13764     {
   13765       TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
   13766       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
   13767 		       sim_stopped, SIM_SIGTRAP);
   13768     }
   13769 
   13770   /* We have encountered an Angel SVC call.  See if we can process it.  */
   13771   switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
   13772     {
   13773     case AngelSVC_Reason_HeapInfo:
   13774       {
   13775 	/* Get the values.  */
   13776 	uint64_t stack_top = aarch64_get_stack_start (cpu);
   13777 	uint64_t heap_base = aarch64_get_heap_start (cpu);
   13778 
   13779 	/* Get the pointer  */
   13780 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
   13781 	ptr = aarch64_get_mem_u64 (cpu, ptr);
   13782 
   13783 	/* Fill in the memory block.  */
   13784 	/* Start addr of heap.  */
   13785 	aarch64_set_mem_u64 (cpu, ptr +  0, heap_base);
   13786 	/* End addr of heap.  */
   13787 	aarch64_set_mem_u64 (cpu, ptr +  8, stack_top);
   13788 	/* Lowest stack addr.  */
   13789 	aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
   13790 	/* Initial stack addr.  */
   13791 	aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
   13792 
   13793 	TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
   13794       }
   13795       break;
   13796 
   13797     case AngelSVC_Reason_Open:
   13798       {
   13799 	/* Get the pointer  */
   13800 	/* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);.  */
   13801 	/* FIXME: For now we just assume that we will only be asked
   13802 	   to open the standard file descriptors.  */
   13803 	static int fd = 0;
   13804 	result = fd ++;
   13805 
   13806 	TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
   13807       }
   13808       break;
   13809 
   13810     case AngelSVC_Reason_Close:
   13811       {
   13812 	uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
   13813 	TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
   13814 	result = 0;
   13815       }
   13816       break;
   13817 
   13818     case AngelSVC_Reason_Errno:
   13819       result = 0;
   13820       TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
   13821       break;
   13822 
   13823     case AngelSVC_Reason_Clock:
   13824       result =
   13825 #ifdef CLOCKS_PER_SEC
   13826 	(CLOCKS_PER_SEC >= 100)
   13827 	? (clock () / (CLOCKS_PER_SEC / 100))
   13828 	: ((clock () * 100) / CLOCKS_PER_SEC)
   13829 #else
   13830 	/* Presume unix... clock() returns microseconds.  */
   13831 	(clock () / 10000)
   13832 #endif
   13833 	;
   13834 	TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
   13835       break;
   13836 
   13837     case AngelSVC_Reason_GetCmdLine:
   13838       {
   13839 	/* Get the pointer  */
   13840 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
   13841 	ptr = aarch64_get_mem_u64 (cpu, ptr);
   13842 
   13843 	/* FIXME: No command line for now.  */
   13844 	aarch64_set_mem_u64 (cpu, ptr, 0);
   13845 	TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
   13846       }
   13847       break;
   13848 
   13849     case AngelSVC_Reason_IsTTY:
   13850       result = 1;
   13851 	TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
   13852       break;
   13853 
   13854     case AngelSVC_Reason_Write:
   13855       {
   13856 	/* Get the pointer  */
   13857 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
   13858 	/* Get the write control block.  */
   13859 	uint64_t fd  = aarch64_get_mem_u64 (cpu, ptr);
   13860 	uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
   13861 	uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
   13862 
   13863 	TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
   13864 		       PRIx64 " on descriptor %" PRIx64,
   13865 		       len, buf, fd);
   13866 
   13867 	if (len > 1280)
   13868 	  {
   13869 	    TRACE_SYSCALL (cpu,
   13870 			   " AngelSVC: Write: Suspiciously long write: %ld",
   13871 			   (long) len);
   13872 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
   13873 			     sim_stopped, SIM_SIGBUS);
   13874 	  }
   13875 	else if (fd == 1)
   13876 	  {
   13877 	    printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
   13878 	  }
   13879 	else if (fd == 2)
   13880 	  {
   13881 	    TRACE (cpu, 0, "\n");
   13882 	    sim_io_eprintf (CPU_STATE (cpu), "%.*s",
   13883 			    (int) len, aarch64_get_mem_ptr (cpu, buf));
   13884 	    TRACE (cpu, 0, "\n");
   13885 	  }
   13886 	else
   13887 	  {
   13888 	    TRACE_SYSCALL (cpu,
   13889 			   " AngelSVC: Write: Unexpected file handle: %d",
   13890 			   (int) fd);
   13891 	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
   13892 			     sim_stopped, SIM_SIGABRT);
   13893 	  }
   13894       }
   13895       break;
   13896 
   13897     case AngelSVC_Reason_ReportException:
   13898       {
   13899 	/* Get the pointer  */
   13900 	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
   13901 	/*ptr = aarch64_get_mem_u64 (cpu, ptr);.  */
   13902 	uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
   13903 	uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
   13904 
   13905 	TRACE_SYSCALL (cpu,
   13906 		       "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
   13907 		       type, state);
   13908 
   13909 	if (type == 0x20026)
   13910 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
   13911 			   sim_exited, state);
   13912 	else
   13913 	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
   13914 			   sim_stopped, SIM_SIGINT);
   13915       }
   13916       break;
   13917 
   13918     case AngelSVC_Reason_Read:
   13919     case AngelSVC_Reason_FLen:
   13920     case AngelSVC_Reason_Seek:
   13921     case AngelSVC_Reason_Remove:
   13922     case AngelSVC_Reason_Time:
   13923     case AngelSVC_Reason_System:
   13924     case AngelSVC_Reason_Rename:
   13925     case AngelSVC_Reason_Elapsed:
   13926     default:
   13927       TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
   13928 		     aarch64_get_reg_u32 (cpu, 0, NO_SP));
   13929       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
   13930 		       sim_stopped, SIM_SIGTRAP);
   13931     }
   13932 
   13933   aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
   13934 }
   13935 
   13936 static void
   13937 dexExcpnGen (sim_cpu *cpu)
   13938 {
   13939   /* instr[31:24] = 11010100
   13940      instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
   13941                           010 ==> HLT,       101 ==> DBG GEN EXCPN
   13942      instr[20,5]  = imm16
   13943      instr[4,2]   = opc2 000 ==> OK, ow ==> UNALLOC
   13944      instr[1,0]   = LL : discriminates opc  */
   13945 
   13946   uint32_t opc = INSTR (23, 21);
   13947   uint32_t imm16 = INSTR (20, 5);
   13948   uint32_t opc2 = INSTR (4, 2);
   13949   uint32_t LL;
   13950 
   13951   NYI_assert (31, 24, 0xd4);
   13952 
   13953   if (opc2 != 0)
   13954     HALT_UNALLOC;
   13955 
   13956   LL = INSTR (1, 0);
   13957 
   13958   /* We only implement HLT and BRK for now.  */
   13959   if (opc == 1 && LL == 0)
   13960     {
   13961       TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
   13962       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
   13963 		       sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
   13964     }
   13965 
   13966   if (opc == 2 && LL == 0)
   13967     handle_halt (cpu, imm16);
   13968 
   13969   else if (opc == 0 || opc == 5)
   13970     HALT_NYI;
   13971 
   13972   else
   13973     HALT_UNALLOC;
   13974 }
   13975 
   13976 /* Stub for accessing system registers.  */
   13977 
   13978 static uint64_t
   13979 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
   13980 	    unsigned crm, unsigned op2)
   13981 {
   13982   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
   13983     /* DCZID_EL0 - the Data Cache Zero ID register.
   13984        We do not support DC ZVA at the moment, so
   13985        we return a value with the disable bit set.
   13986        We implement support for the DCZID register since
   13987        it is used by the C library's memset function.  */
   13988     return ((uint64_t) 1) << 4;
   13989 
   13990   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
   13991     /* Cache Type Register.  */
   13992     return 0x80008000UL;
   13993 
   13994   if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
   13995     /* TPIDR_EL0 - thread pointer id.  */
   13996     return aarch64_get_thread_id (cpu);
   13997 
   13998   if (op1 == 3 && crm == 4 && op2 == 0)
   13999     return aarch64_get_FPCR (cpu);
   14000 
   14001   if (op1 == 3 && crm == 4 && op2 == 1)
   14002     return aarch64_get_FPSR (cpu);
   14003 
   14004   else if (op1 == 3 && crm == 2 && op2 == 0)
   14005     return aarch64_get_CPSR (cpu);
   14006 
   14007   HALT_NYI;
   14008 }
   14009 
   14010 static void
   14011 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
   14012 	    unsigned crm, unsigned op2, uint64_t val)
   14013 {
   14014   if (op1 == 3 && crm == 4 && op2 == 0)
   14015     aarch64_set_FPCR (cpu, val);
   14016 
   14017   else if (op1 == 3 && crm == 4 && op2 == 1)
   14018     aarch64_set_FPSR (cpu, val);
   14019 
   14020   else if (op1 == 3 && crm == 2 && op2 == 0)
   14021     aarch64_set_CPSR (cpu, val);
   14022 
   14023   else
   14024     HALT_NYI;
   14025 }
   14026 
   14027 static void
   14028 do_mrs (sim_cpu *cpu)
   14029 {
   14030   /* instr[31:20] = 1101 0101 0001 1
   14031      instr[19]    = op0
   14032      instr[18,16] = op1
   14033      instr[15,12] = CRn
   14034      instr[11,8]  = CRm
   14035      instr[7,5]   = op2
   14036      instr[4,0]   = Rt  */
   14037   unsigned sys_op0 = INSTR (19, 19) + 2;
   14038   unsigned sys_op1 = INSTR (18, 16);
   14039   unsigned sys_crn = INSTR (15, 12);
   14040   unsigned sys_crm = INSTR (11, 8);
   14041   unsigned sys_op2 = INSTR (7, 5);
   14042   unsigned rt = INSTR (4, 0);
   14043 
   14044   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   14045   aarch64_set_reg_u64 (cpu, rt, NO_SP,
   14046 		       system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
   14047 }
   14048 
   14049 static void
   14050 do_MSR_immediate (sim_cpu *cpu)
   14051 {
   14052   /* instr[31:19] = 1101 0101 0000 0
   14053      instr[18,16] = op1
   14054      instr[15,12] = 0100
   14055      instr[11,8]  = CRm
   14056      instr[7,5]   = op2
   14057      instr[4,0]   = 1 1111  */
   14058 
   14059   unsigned op1 = INSTR (18, 16);
   14060   /*unsigned crm = INSTR (11, 8);*/
   14061   unsigned op2 = INSTR (7, 5);
   14062 
   14063   NYI_assert (31, 19, 0x1AA0);
   14064   NYI_assert (15, 12, 0x4);
   14065   NYI_assert (4,  0,  0x1F);
   14066 
   14067   if (op1 == 0)
   14068     {
   14069       if (op2 == 5)
   14070 	HALT_NYI; /* set SPSel.  */
   14071       else
   14072 	HALT_UNALLOC;
   14073     }
   14074   else if (op1 == 3)
   14075     {
   14076       if (op2 == 6)
   14077 	HALT_NYI; /* set DAIFset.  */
   14078       else if (op2 == 7)
   14079 	HALT_NYI; /* set DAIFclr.  */
   14080       else
   14081 	HALT_UNALLOC;
   14082     }
   14083   else
   14084     HALT_UNALLOC;
   14085 }
   14086 
   14087 static void
   14088 do_MSR_reg (sim_cpu *cpu)
   14089 {
   14090   /* instr[31:20] = 1101 0101 0001
   14091      instr[19]    = op0
   14092      instr[18,16] = op1
   14093      instr[15,12] = CRn
   14094      instr[11,8]  = CRm
   14095      instr[7,5]   = op2
   14096      instr[4,0]   = Rt  */
   14097 
   14098   unsigned sys_op0 = INSTR (19, 19) + 2;
   14099   unsigned sys_op1 = INSTR (18, 16);
   14100   unsigned sys_crn = INSTR (15, 12);
   14101   unsigned sys_crm = INSTR (11, 8);
   14102   unsigned sys_op2 = INSTR (7, 5);
   14103   unsigned rt = INSTR (4, 0);
   14104 
   14105   NYI_assert (31, 20, 0xD51);
   14106 
   14107   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   14108   system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
   14109 	      aarch64_get_reg_u64 (cpu, rt, NO_SP));
   14110 }
   14111 
   14112 static void
   14113 do_SYS (sim_cpu *cpu)
   14114 {
   14115   /* instr[31,19] = 1101 0101 0000 1
   14116      instr[18,16] = op1
   14117      instr[15,12] = CRn
   14118      instr[11,8]  = CRm
   14119      instr[7,5]   = op2
   14120      instr[4,0]   = Rt  */
   14121   NYI_assert (31, 19, 0x1AA1);
   14122 
   14123   /* FIXME: For now we just silently accept system ops.  */
   14124 }
   14125 
   14126 static void
   14127 dexSystem (sim_cpu *cpu)
   14128 {
   14129   /* instr[31:22] = 1101 01010 0
   14130      instr[21]    = L
   14131      instr[20,19] = op0
   14132      instr[18,16] = op1
   14133      instr[15,12] = CRn
   14134      instr[11,8]  = CRm
   14135      instr[7,5]   = op2
   14136      instr[4,0]   = uimm5  */
   14137 
   14138   /* We are interested in HINT, DSB, DMB and ISB
   14139 
   14140      Hint #0 encodes NOOP (this is the only hint we care about)
   14141      L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
   14142      CRm op2  != 0000 000 OR CRm op2 == 0000 000 || CRm op > 0000 101
   14143 
   14144      DSB, DMB, ISB are data store barrier, data memory barrier and
   14145      instruction store barrier, respectively, where
   14146 
   14147      L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
   14148      op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
   14149      CRm<3:2> ==> domain, CRm<1:0> ==> types,
   14150      domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
   14151               10 ==> InerShareable, 11 ==> FullSystem
   14152      types :  01 ==> Reads, 10 ==> Writes,
   14153               11 ==> All, 00 ==> All (domain == FullSystem).  */
   14154 
   14155   unsigned rt = INSTR (4, 0);
   14156 
   14157   NYI_assert (31, 22, 0x354);
   14158 
   14159   switch (INSTR (21, 12))
   14160     {
   14161     case 0x032:
   14162       if (rt == 0x1F)
   14163 	{
   14164 	  /* NOP has CRm != 0000 OR.  */
   14165 	  /*         (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
   14166 	  uint32_t crm = INSTR (11, 8);
   14167 	  uint32_t op2 = INSTR (7, 5);
   14168 
   14169 	  if (crm != 0 || (op2 == 0 || op2 > 5))
   14170 	    {
   14171 	      /* Actually call nop method so we can reimplement it later.  */
   14172 	      nop (cpu);
   14173 	      return;
   14174 	    }
   14175 	}
   14176       HALT_NYI;
   14177 
   14178     case 0x033:
   14179       {
   14180 	uint32_t op2 =  INSTR (7, 5);
   14181 
   14182 	switch (op2)
   14183 	  {
   14184 	  case 2: HALT_NYI;
   14185 	  case 4: dsb (cpu); return;
   14186 	  case 5: dmb (cpu); return;
   14187 	  case 6: isb (cpu); return;
   14188 	  default: HALT_UNALLOC;
   14189 	}
   14190       }
   14191 
   14192     case 0x3B0:
   14193     case 0x3B4:
   14194     case 0x3BD:
   14195       do_mrs (cpu);
   14196       return;
   14197 
   14198     case 0x0B7:
   14199       do_SYS (cpu); /* DC is an alias of SYS.  */
   14200       return;
   14201 
   14202     default:
   14203       if (INSTR (21, 20) == 0x1)
   14204 	do_MSR_reg (cpu);
   14205       else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
   14206 	do_MSR_immediate (cpu);
   14207       else
   14208 	HALT_NYI;
   14209       return;
   14210     }
   14211 }
   14212 
   14213 static void
   14214 dexBr (sim_cpu *cpu)
   14215 {
   14216   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
   14217      assert  group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
   14218      bits [31,29] of a BrExSys are the secondary dispatch vector.  */
   14219   uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
   14220 
   14221   switch (group2)
   14222     {
   14223     case BR_IMM_000:
   14224       return dexBranchImmediate (cpu);
   14225 
   14226     case BR_IMMCMP_001:
   14227       /* Compare has bit 25 clear while test has it set.  */
   14228       if (!INSTR (25, 25))
   14229 	dexCompareBranchImmediate (cpu);
   14230       else
   14231 	dexTestBranchImmediate (cpu);
   14232       return;
   14233 
   14234     case BR_IMMCOND_010:
   14235       /* This is a conditional branch if bit 25 is clear otherwise
   14236          unallocated.  */
   14237       if (!INSTR (25, 25))
   14238 	dexCondBranchImmediate (cpu);
   14239       else
   14240 	HALT_UNALLOC;
   14241       return;
   14242 
   14243     case BR_UNALLOC_011:
   14244       HALT_UNALLOC;
   14245 
   14246     case BR_IMM_100:
   14247       dexBranchImmediate (cpu);
   14248       return;
   14249 
   14250     case BR_IMMCMP_101:
   14251       /* Compare has bit 25 clear while test has it set.  */
   14252       if (!INSTR (25, 25))
   14253 	dexCompareBranchImmediate (cpu);
   14254       else
   14255 	dexTestBranchImmediate (cpu);
   14256       return;
   14257 
   14258     case BR_REG_110:
   14259       /* Unconditional branch reg has bit 25 set.  */
   14260       if (INSTR (25, 25))
   14261 	dexBranchRegister (cpu);
   14262 
   14263       /* This includes both Excpn Gen, System and unalloc operations.
   14264          We need to decode the Excpn Gen operation BRK so we can plant
   14265          debugger entry points.
   14266          Excpn Gen operations have instr [24] = 0.
   14267          we need to decode at least one of the System operations NOP
   14268          which is an alias for HINT #0.
   14269          System operations have instr [24,22] = 100.  */
   14270       else if (INSTR (24, 24) == 0)
   14271 	dexExcpnGen (cpu);
   14272 
   14273       else if (INSTR (24, 22) == 4)
   14274 	dexSystem (cpu);
   14275 
   14276       else
   14277 	HALT_UNALLOC;
   14278 
   14279       return;
   14280 
   14281     case BR_UNALLOC_111:
   14282       HALT_UNALLOC;
   14283 
   14284     default:
   14285       /* Should never reach here.  */
   14286       HALT_NYI;
   14287     }
   14288 }
   14289 
   14290 static void
   14291 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
   14292 {
   14293   /* We need to check if gdb wants an in here.  */
   14294   /* checkBreak (cpu);.  */
   14295 
   14296   uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
   14297 
   14298   switch (group)
   14299     {
   14300     case GROUP_PSEUDO_0000:   dexPseudo (cpu); break;
   14301     case GROUP_LDST_0100:     dexLdSt (cpu); break;
   14302     case GROUP_DPREG_0101:    dexDPReg (cpu); break;
   14303     case GROUP_LDST_0110:     dexLdSt (cpu); break;
   14304     case GROUP_ADVSIMD_0111:  dexAdvSIMD0 (cpu); break;
   14305     case GROUP_DPIMM_1000:    dexDPImm (cpu); break;
   14306     case GROUP_DPIMM_1001:    dexDPImm (cpu); break;
   14307     case GROUP_BREXSYS_1010:  dexBr (cpu); break;
   14308     case GROUP_BREXSYS_1011:  dexBr (cpu); break;
   14309     case GROUP_LDST_1100:     dexLdSt (cpu); break;
   14310     case GROUP_DPREG_1101:    dexDPReg (cpu); break;
   14311     case GROUP_LDST_1110:     dexLdSt (cpu); break;
   14312     case GROUP_ADVSIMD_1111:  dexAdvSIMD1 (cpu); break;
   14313 
   14314     case GROUP_UNALLOC_0001:
   14315     case GROUP_UNALLOC_0010:
   14316     case GROUP_UNALLOC_0011:
   14317       HALT_UNALLOC;
   14318 
   14319     default:
   14320       /* Should never reach here.  */
   14321       HALT_NYI;
   14322     }
   14323 }
   14324 
   14325 static bfd_boolean
   14326 aarch64_step (sim_cpu *cpu)
   14327 {
   14328   uint64_t pc = aarch64_get_PC (cpu);
   14329 
   14330   if (pc == TOP_LEVEL_RETURN_PC)
   14331     return FALSE;
   14332 
   14333   aarch64_set_next_PC (cpu, pc + 4);
   14334 
   14335   /* Code is always little-endian.  */
   14336   sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
   14337 			& aarch64_get_instr (cpu), pc, 4);
   14338   aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
   14339 
   14340   TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
   14341 	      aarch64_get_instr (cpu));
   14342   TRACE_DISASM (cpu, pc);
   14343 
   14344   aarch64_decode_and_execute (cpu, pc);
   14345 
   14346   return TRUE;
   14347 }
   14348 
   14349 void
   14350 aarch64_run (SIM_DESC sd)
   14351 {
   14352   sim_cpu *cpu = STATE_CPU (sd, 0);
   14353 
   14354   while (aarch64_step (cpu))
   14355     {
   14356       aarch64_update_PC (cpu);
   14357 
   14358       if (sim_events_tick (sd))
   14359 	sim_events_process (sd);
   14360     }
   14361 
   14362   sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
   14363 		   sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
   14364 }
   14365 
   14366 void
   14367 aarch64_init (sim_cpu *cpu, uint64_t pc)
   14368 {
   14369   uint64_t sp = aarch64_get_stack_start (cpu);
   14370 
   14371   /* Install SP, FP and PC and set LR to -20
   14372      so we can detect a top-level return.  */
   14373   aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
   14374   aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
   14375   aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
   14376   aarch64_set_next_PC (cpu, pc);
   14377   aarch64_update_PC (cpu);
   14378   aarch64_init_LIT_table ();
   14379 }
   14380