sim/aarch64/simulator.c

/* simulator.c -- Interface for the AArch64 simulator.

   Copyright (C) 2015-2020 Free Software Foundation, Inc.

   Contributed by Red Hat.

   This file is part of GDB.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <math.h>
#include <time.h>
#include <limits.h>

#include "simulator.h"
#include "cpustate.h"
#include "memory.h"

/* Passed as the third argument of the aarch64_get_reg_* and
   aarch64_set_reg_* accessors used in this file: SP_OK where the
   operand is allowed to be the stack pointer (e.g. a load/store base
   register), NO_SP where it is not.  */
#define NO_SP 0
#define SP_OK 1

/* CPSR flag tests.  All three expect a variable named `cpu' to be in
   scope at the point of use.  IS_SET and IS_CLEAR normalise the result
   of the test to exactly 1 or 0.  */
#define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
#define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)

/* Space saver macro.  Extracts bits HIGH..LOW of the instruction
   returned by aarch64_get_instr for `cpu', which must be in scope.  */
#define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))

/* Report an unallocated instruction encoding: emit the disassembly and
   an instruction trace naming the simulator source line and the current
   PC, then stop the simulation at that PC with sim_stopped / SIM_SIGILL.
   Expects `cpu' to be in scope.  */
#define HALT_UNALLOC							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unallocated instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGILL);			\
    }									\
  while (0)

/* Report an instruction the simulator does not implement: trace it in
   the same way as HALT_UNALLOC, additionally print the raw instruction
   word through sim_io_eprintf when no tracing is enabled (so the
   failure is never silent), then stop the simulation at the current PC
   with sim_stopped / SIM_SIGABRT.  Expects `cpu' to be in scope.  */
#define HALT_NYI							\
  do									\
    {									\
      TRACE_DISASM (cpu, aarch64_get_PC (cpu));				\
      TRACE_INSN (cpu,							\
		  "Unimplemented instruction detected at sim line %d,"	\
		  " exe addr %" PRIx64,					\
		  __LINE__, aarch64_get_PC (cpu));			\
      if (! TRACE_ANY_P (cpu))						\
        sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
                        aarch64_get_instr (cpu));			\
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
		       sim_stopped, SIM_SIGABRT);			\
    }									\
  while (0)

/* Decode-time check: if bits HI..LO of the current instruction do not
   equal EXPECTED, treat the encoding as unimplemented via HALT_NYI.  */
#define NYI_assert(HI, LO, EXPECTED)					\
  do									\
    {									\
      if (INSTR ((HI), (LO)) != (EXPECTED))				\
	HALT_NYI;							\
    }									\
  while (0)

/* Helper functions used by expandLogicalImmediate.  */

/* Return a mask whose low N bits are set and whose other bits are
   zero, i.e. for i = 1, ... N result<i-1> = 1.  N must be in the
   range 0..64.  */
static inline uint64_t
ones (int N)
{
  /* Do the arithmetic in uint64_t: a plain `1UL' is only 32 bits wide
     on ILP32 and LLP64 hosts, where shifting it by 32 or more is
     undefined and `-1UL' would yield only 32 set bits.  N == 64 stays
     special-cased because a full-width shift is undefined as well.  */
  return N == 64 ? ~(uint64_t) 0 : (((uint64_t) 1 << N) - 1);
}

/* Return bit N of VAL in bit 0 of the result, by asking pickbits64
   for the one-bit field N..N.  */
static inline uint64_t
pickbit (uint64_t val, int N)
{
  const int bit = N;

  return pickbits64 (val, bit, bit);
}

/* Expand the (N, immr, imms) encoding of a logical immediate into its
   64-bit value.  S is imms, R is immr and N is the N bit.

   The element is S+1 consecutive one bits, rotated right by R within
   an element of SIMD_SIZE bits, and then replicated to fill 64 bits.
   Returns 0 for the encodings this function rejects (reserved element
   sizes and an all-ones element).  */
static uint64_t
expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
{
  unsigned simd_size;
  unsigned width;
  uint64_t mask;
  uint64_t imm;

  if (N != 0)
    {
      /* A single 64-bit element; S and R are used as given.  */
      simd_size = 64;
      mask = 0xffffffffffffffffull;
    }
  else
    {
      /* The leading one bits of S select the element size; the
	 remaining low bits give the run length.  */
      if (S <= 0x1f)		/* 0xxxxx */
	simd_size = 32;
      else if (S <= 0x2f)	/* 10xxxx */
	{
	  simd_size = 16;
	  S &= 0xf;
	}
      else if (S <= 0x37)	/* 110xxx */
	{
	  simd_size = 8;
	  S &= 0x7;
	}
      else if (S <= 0x3b)	/* 1110xx */
	{
	  simd_size = 4;
	  S &= 0x3;
	}
      else if (S <= 0x3d)	/* 11110x */
	{
	  simd_size = 2;
	  S &= 0x1;
	}
      else
	return 0;

      mask = (1ull << simd_size) - 1;
      /* Rotation bits above the element size are ignored.  */
      R &= simd_size - 1;
    }

  /* A run filling the whole element would give 0xf..f, which is
     rejected.  This also guarantees S + 1 < 64 below.  */
  if (S == simd_size - 1)
    return 0;

  /* S+1 consecutive low bits set.  */
  imm = (1ull << (S + 1)) - 1;

  /* Rotate right by R within the element (equivalently, left by
     simd_size - R).  */
  if (R != 0)
    imm = ((imm << (simd_size - R)) & mask) | (imm >> R);

  /* Replicate the element upwards, doubling the populated width each
     step until all 64 bits are filled.  */
  for (width = simd_size; width < 64; width *= 2)
    imm |= imm << width;

  return imm;
}

/* Instr[22,10] encodes N, immr and imms.  We want a lookup table with
   one entry for each possible combination, i.e. 13 bits worth of
   entries.  The table is indexed by that 13-bit field (N in bit 12,
   immr in bits 11..6, imms in bits 5..0) and is filled in by
   aarch64_init_LIT_table; rejected encodings hold 0.  */
#define  LI_TABLE_SIZE  (1 << 13)
static uint64_t LITable[LI_TABLE_SIZE];

/* Populate LITable with the expansion of every 13-bit N:immr:imms
   logical immediate encoding.  */
void
aarch64_init_LIT_table (void)
{
  unsigned encoding = 0;

  while (encoding < LI_TABLE_SIZE)
    {
      /* Field layout of the table index: N | immr | imms.  */
      uint32_t n_bit  = uimm (encoding, 12, 12);
      uint32_t rotate = uimm (encoding, 11, 6);
      uint32_t length = uimm (encoding, 5, 0);

      LITable [encoding] = expand_logical_immediate (length, rotate, n_bit);
      encoding++;
    }
}

/* Handle the NOTIFY pseudo instruction.

   instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
                         2 ==> exit Java, 3 ==> start next bytecode.

   The notification is only traced.  The hooks that the types would
   map to are not called here:
     0: aarch64_notifyMethodEntry (R23, R22)
     1: aarch64_notifyMethodReentry (R23, R22)
     2: aarch64_notifyMethodExit ()
     3: aarch64_notifyBCStart (R23, R22)  */
static void
dexNotify (sim_cpu *cpu)
{
  uint32_t notify_kind = INSTR (14, 0);

  TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", notify_kind);
}

/* secondary decode within top level groups  */

/* Decode the pseudo instruction group (assert instr[28,27] = 00).

   HALT stops execution of the simulator causing an immediate return
   to the x86 code which entered it.

   CALLOUT initiates recursive entry into x86 code.  A register
   argument holds the address of the x86 routine.  Immediate values in
   the instruction identify the number of general purpose and floating
   point register arguments to be passed and the type of any value to
   be returned.  Callouts are not handled at the moment: they stop the
   simulation with SIM_SIGABRT.

   NOTIFY is forwarded to dexNotify.  Anything else is unallocated.  */
static void
dexPseudo (sim_cpu *cpu)
{
  const uint32_t halt_insn      = 0xE0000000U;
  const uint32_t callout_code   = 0x00018000U;
  const uint32_t calloutr_code  = 0x00018001U;
  const uint32_t notify_code    = 0x00014000U;
  uint32_t opcode;

  /* HALT is recognised by the complete instruction word.  */
  if (aarch64_get_instr (cpu) == halt_insn)
    {
      TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGTRAP);
    }

  /* The remaining pseudo instructions are selected by instr[31,15].  */
  opcode = INSTR (31, 15);

  if (opcode == notify_code)
    {
      dexNotify (cpu);
      return;
    }

  if (opcode != callout_code && opcode != calloutr_code)
    {
      HALT_UNALLOC;
      return;
    }

  /* We do not handle callouts at the moment.  */
  TRACE_EVENTS (cpu, " Callout");
  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		   sim_stopped, SIM_SIGABRT);
}

/* Load-store single register (unscaled offset)
   These instructions employ a base register plus an unscaled signed
   9 bit offset.

   N.B. the base register (source) can be Xn or SP. all other
   registers may not be SP.  */

/* 32 bit load of a 32 bit word; OFFSET is the unscaled signed 9 bit
   displacement added to the base register (which may be SP).  The
   word is written to the full 64 bit destination, zero-extended.  */
static void
ldur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u32 (cpu, address));
}

/* 64 bit load of a 64 bit doubleword; OFFSET is the unscaled signed
   9 bit displacement added to the base register (which may be SP).  */
static void
ldur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u64 (cpu, address));
}

/* 32 bit load of a zero-extended byte; OFFSET is the unscaled signed
   9 bit displacement added to the base register (which may be SP).  */
static void
ldurb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u8 (cpu, address));
}

/* 32 bit load of a sign-extended byte; OFFSET is the unscaled signed
   9 bit displacement added to the base register (which may be SP).  */
static void
ldursb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;
  uint32_t word;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  /* Sign-extend to 32 bits only; narrowing to uint32_t leaves the
     upper half of the 64 bit register zero.  */
  word = (uint32_t) aarch64_get_mem_s8 (cpu, address);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, word);
}

/* 64 bit load of a sign-extended byte; OFFSET is the unscaled signed
   9 bit displacement added to the base register (which may be SP).  */
static void
ldursb64 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_s8 (cpu, address));
}

/* 32 bit load of a zero-extended halfword; OFFSET is the unscaled
   signed 9 bit displacement added to the base register (which may be
   SP).  */
static void
ldurh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u16 (cpu, address));
}

/* 32 bit load of a sign-extended halfword; OFFSET is the unscaled
   signed 9 bit displacement added to the base register (which may be
   SP).  */
static void
ldursh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;
  uint32_t word;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  /* Sign-extend to 32 bits only; narrowing to uint32_t leaves the
     upper half of the 64 bit register zero.  */
  word = (uint32_t) aarch64_get_mem_s16 (cpu, address);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, word);
}

/* 64 bit load of a sign-extended halfword; OFFSET is the unscaled
   signed 9 bit displacement added to the base register (which may be
   SP).  */
static void
ldursh64 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_s16 (cpu, address));
}

/* 64 bit load of a sign-extended 32 bit word; OFFSET is the unscaled
   signed 9 bit displacement added to the base register (which may be
   SP).  The destination may not be SP.  */
static void
ldursw (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
  /* Widen the signed word to 64 bits so that bit 31 is replicated
     into the upper half of the register.  Casting the value to
     uint32_t and storing it with aarch64_set_reg_u64 would
     zero-extend instead, turning negative words into large positive
     values.  This matches the other 64 bit sign-extending loads
     (ldursb64, ldursh64).  */
  val = aarch64_get_mem_s32 (cpu, address);
  aarch64_set_reg_s64 (cpu, rd, NO_SP, val);
}

/* N.B. with stores the value in source is written to the address
   identified by source2 modified by offset.  */

/* 32 bit store of the low word of the source register; OFFSET is the
   unscaled signed 9 bit displacement added to the base register
   (which may be SP).  */
static void
stur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_mem_u32 (cpu, address,
		       aarch64_get_reg_u32 (cpu, src_reg, NO_SP));
}

/* 64 bit store of the source register; OFFSET is the unscaled signed
   9 bit displacement added to the base register (which may be SP).  */
static void
stur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_mem_u64 (cpu, address,
		       aarch64_get_reg_u64 (cpu, src_reg, NO_SP));
}

/* Store of the low byte of the source register; OFFSET is the
   unscaled signed 9 bit displacement added to the base register
   (which may be SP).  */
static void
sturb (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_mem_u8 (cpu, address,
		      aarch64_get_reg_u8 (cpu, src_reg, NO_SP));
}

/* Store of the low halfword of the source register; OFFSET is the
   unscaled signed 9 bit displacement added to the base register
   (which may be SP).  */
static void
sturh (sim_cpu *cpu, int32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_mem_u16 (cpu, address,
		       aarch64_get_reg_u16 (cpu, src_reg, NO_SP));
}

/* Load single register pc-relative label
   Offset is a signed 19 bit immediate count in words
   rt may not be SP.  */

/* 32 bit pc-relative load.  OFFSET counts words, so the target is
   PC + OFFSET * 4.  */
static void
ldr32_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_PC (cpu) + offset * 4;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u32 (cpu, address));
}

/* 64 bit pc-relative load.  OFFSET counts words, so the target is
   PC + OFFSET * 4.  */
static void
ldr_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_PC (cpu) + offset * 4;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u64 (cpu, address));
}

/* Sign-extended 32 bit pc-relative load.  OFFSET counts words, so the
   target is PC + OFFSET * 4.  The signed word is handed straight to
   the 64 bit register setter.  */
static void
ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_PC (cpu) + offset * 4;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_s32 (cpu, address));
}

/* Float (32 bit) pc-relative load into element 0 of the vector
   register.  OFFSET counts words, so the target is PC + OFFSET * 4.  */
static void
fldrs_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int fp_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_PC (cpu) + offset * 4;
  aarch64_set_vec_u32 (cpu, fp_reg, 0, aarch64_get_mem_u32 (cpu, address));
}

/* Double (64 bit) pc-relative load into element 0 of the vector
   register.  OFFSET counts words, so the target is PC + OFFSET * 4.  */
static void
fldrd_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int fp_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_PC (cpu) + offset * 4;
  aarch64_set_vec_u64 (cpu, fp_reg, 0, aarch64_get_mem_u64 (cpu, address));
}

/* Long double (128 bit) pc-relative load.  OFFSET counts words, so
   the target is PC + OFFSET * 4.  */
static void
fldrq_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int fp_reg = INSTR (4, 0);
  uint64_t target = aarch64_get_PC (cpu) + offset * 4;
  FRegister quad;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Fetch the 128 bit value into a temporary, then copy it into the
     FP register.  */
  aarch64_get_mem_long_double (cpu, target, & quad);
  aarch64_set_FP_long_double (cpu, fp_reg, quad);
}

/* This can be used to scale an offset by applying the requisite
   shift.  The second argument is the element size in bits; it is
   pasted onto `ScaleShift' to name the shift constant, and the callers
   in this file use 16, 32, 64 and 128.  */

#define SCALE(_offset, _elementSize) \
    ((_offset) << ScaleShift ## _elementSize)

/* This can be used to optionally scale a register derived offset
   by applying the requisite shift as indicated by the Scaling
   argument.  The second argument is pasted onto `ScaleShift' to name
   the shift constant (the callers in this file pass 16, 32, 64 or
   128).  The third argument is the scaling selector: when it is
   non-zero the offset is shifted by that constant, otherwise it is
   shifted by 0, i.e. left unchanged.  */

#define OPT_SCALE(_offset, _elementType, _Scaling) \
  ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))

/* Widen a 32 bit register derived value to 64 bits.  VALUE is the raw
   32 bit quantity.  For an EXTENSION of UXTW or NoExtension the value
   is zero-extended; for any other extension it is reinterpreted as a
   signed 32 bit integer and sign-extended.  The result is returned as
   an int64_t.  */

static inline int64_t
extend (uint32_t value, Extension extension)
{
  /* Reinterpret the bits as signed through a union rather than a
     cast.  */
  union
  {
    uint32_t raw;
    int32_t  as_signed;
  } pun;

  /* A branchless variant of this ought to be possible.  */
  if (extension != UXTW && extension != NoExtension)
    {
      pun.raw = value;
      return pun.as_signed;
    }

  return value;
}

/* Scalar Floating Point

   FP load/store single register (4 addressing modes)

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.  */

/* Load 32 bits into element 0 of an FP/vector register using an
   unscaled signed 9 bit OFFSET, with optional pre- or post-index
   write-back of the base register (which may be SP).  */
static void
fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned fp_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  uint64_t updated = base + offset;
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  uint64_t access = (wb == Post) ? base : updated;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, fp_reg, 0, aarch64_get_mem_u32 (cpu, access));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Load 8 bits into element 0 of an FP/vector register.  OFFSET is the
   unsigned 12 bit immediate; a byte access needs no scaling.  The
   base register may be SP.  */
static void
fldrb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Read exactly one byte.  A 32 bit read here would also touch the
     three bytes after ADDR, which may lie outside the accessed object,
     and would depend on truncation to pick out the wanted byte.  */
  aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
}

/* Load 16 bits into element 0 of an FP/vector register.  OFFSET is
   the unsigned 12 bit immediate, scaled by the halfword size.  The
   base register may be SP.  */
static void
fldrh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned fp_reg = INSTR (4, 0);
  unsigned base_reg = INSTR (9, 5);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  uint64_t target = base + SCALE (offset, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u16 (cpu, fp_reg, 0, aarch64_get_mem_u16 (cpu, target));
}

/* Load 32 bits into element 0 of an FP/vector register.  OFFSET is
   the unsigned 12 bit immediate, scaled by the word size.  The base
   register may be SP.  */
static void
fldrs_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned fp_reg = INSTR (4, 0);
  unsigned base_reg = INSTR (9, 5);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  uint64_t target = base + SCALE (offset, 32);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, fp_reg, 0, aarch64_get_mem_u32 (cpu, target));
}

/* Load 64 bits into element 0 of an FP/vector register.  OFFSET is
   the unsigned 12 bit immediate, scaled by the doubleword size.  The
   base register may be SP.  */
static void
fldrd_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned fp_reg = INSTR (4, 0);
  unsigned base_reg = INSTR (9, 5);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  uint64_t target = base + SCALE (offset, 64);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, fp_reg, 0, aarch64_get_mem_u64 (cpu, target));
}

/* Load 128 bits into an FP/vector register as two 64 bit halves.
   OFFSET is the unsigned 12 bit immediate, scaled by the quadword
   size.  The base register may be SP.  */
static void
fldrq_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned fp_reg = INSTR (4, 0);
  unsigned base_reg = INSTR (9, 5);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  uint64_t target = base + SCALE (offset, 128);
  uint64_t half;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The doubleword at TARGET goes into element 0, the next one into
     element 1.  */
  half = aarch64_get_mem_u64 (cpu, target);
  aarch64_set_vec_u64 (cpu, fp_reg, 0, half);
  half = aarch64_get_mem_u64 (cpu, target + 8);
  aarch64_set_vec_u64 (cpu, fp_reg, 1, half);
}

/* Load 32 bits into element 0 of an FP/vector register from base
   register plus a zero- or sign-extended 32 bit register offset,
   optionally scaled by the word size.  */
static void
fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned fp_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t target = base + (uint64_t) OPT_SCALE (index, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, fp_reg, 0, aarch64_get_mem_u32 (cpu, target));
}

/* Load 64 bits into element 0 of an FP/vector register using an
   unscaled signed 9 bit OFFSET, with optional pre- or post-index
   write-back of the base register (which may be SP).  */
static void
fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned fp_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  uint64_t updated = base + offset;
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  uint64_t access = (wb == Post) ? base : updated;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, fp_reg, 0, aarch64_get_mem_u64 (cpu, access));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Load 64 bits into element 0 of an FP/vector register from base
   register plus a zero- or sign-extended 32 bit register offset,
   optionally scaled by the doubleword size.  The base register may be
   SP.  */
static void
fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Add the displacement at full 64 bit width.  Routing it through
     fldrd_wb would squeeze it into that function's int32_t offset
     parameter, corrupting the address for zero-extended offsets of
     2^31 and above and for scaled offsets that need more than 32
     bits.  */
  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
		       (cpu, address + displacement));
}

/* Load 128 bits into an FP register using an unscaled signed 9 bit
   OFFSET, with optional pre- or post-index write-back of the base
   register (which may be SP).  */
static void
fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  FRegister quad;
  unsigned base_reg = INSTR (9, 5);
  unsigned fp_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  uint64_t updated = base + offset;
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  uint64_t access = (wb == Post) ? base : updated;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, access, & quad);
  aarch64_set_FP_long_double (cpu, fp_reg, quad);

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Load 128 bits into an FP register from base register plus a zero-
   or sign-extended 32 bit register offset, optionally scaled by the
   quadword size.  The base register may be SP.  */
static void
fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  FRegister a;
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 128, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Add the displacement at full 64 bit width.  Routing it through
     fldrq_wb would squeeze it into that function's int32_t offset
     parameter, corrupting the address for zero-extended offsets of
     2^31 and above and for scaled offsets that need more than 32
     bits.  */
  aarch64_get_mem_long_double (cpu, address + displacement, & a);
  aarch64_set_FP_long_double (cpu, st, a);
}

/* Memory Access

   load-store single register
   There are four addressing modes available here which all employ a
   64 bit source (base) register.

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2)can only be an Xn register.

   Scaled, 12-bit, unsigned immediate offset, without pre- and
   post-index options.
   Unscaled, 9-bit, signed immediate offset with pre- or post-index
   writeback.
   scaled or unscaled 64-bit register offset.
   scaled or unscaled 32-bit extended register offset.

   All offsets are assumed to be raw from the decode i.e. the
   simulator is expected to adjust scaled offsets based on the
   accessed data size with register or extended register offset
   versions the same applies except that in the latter case the
   operation may also require a sign extend.

   A separate method is provided for each possible addressing mode.  */

/* 32 bit load of a 32 bit word.  OFFSET is the unsigned 12 bit
   immediate, scaled by the word size.  */
static void
ldr32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The base may be SP; the destination may not.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 32);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u32 (cpu, address));
}

/* 32 bit load of a 32 bit word using an unscaled signed 9 bit OFFSET,
   with optional pre- or post-index write-back of the base register
   (which may be SP).  Write-back with the base also being the
   destination is treated as unallocated.  */
static void
ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u32 (cpu, wb == Post ? base : updated));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* 32 bit load of a 32 bit word from base register plus a zero- or
   sign-extended 32 bit register offset, optionally scaled by the word
   size.  The base may be SP; the offset and destination registers
   are accessed with NO_SP.  */
static void
ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t target = base + (uint64_t) OPT_SCALE (index, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u32 (cpu, target));
}

/* 64 bit load of a 64 bit doubleword.  OFFSET is the unsigned 12 bit
   immediate, scaled by the doubleword size.  */
static void
ldr_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The base may be SP; the destination may not.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 64);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u64 (cpu, address));
}

/* 64 bit load of a 64 bit doubleword using an unscaled signed 9 bit
   OFFSET, with optional pre- or post-index write-back of the base
   register (which may be SP).  Write-back with the base also being
   the destination is treated as unallocated.  */
static void
ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u64 (cpu, wb == Post ? base : updated));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* 64 bit load of a 64 bit doubleword from base register plus a zero-
   or sign-extended 32 bit register offset, optionally scaled by the
   doubleword size.  The base may be SP; the offset and destination
   registers are accessed with NO_SP.  */
static void
ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t target = base + (uint64_t) OPT_SCALE (index, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u64 (cpu, target));
}

/* 32 bit load of a zero-extended byte.  OFFSET is the unsigned 12 bit
   immediate; no scaling is required for a byte load.  */
static void
ldrb32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The base may be SP; the destination may not.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u8 (cpu, address));
}

/* 32 bit load of a zero-extended byte using an unscaled signed 9 bit
   OFFSET, with optional pre- or post-index write-back of the base
   register (which may be SP).  Write-back with the base also being
   the destination is treated as unallocated.  */
static void
ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u8 (cpu, wb == Post ? base : updated));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* 32 bit load of a zero-extended byte from base register plus a zero-
   or sign-extended 32 bit register offset.  SCALING is not used: no
   scaling is required for a byte load.  The base may be SP; the
   offset and destination registers are accessed with NO_SP.  */
static void
ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t target = base + index;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u8 (cpu, target));
}

/* 64 bit load of a sign-extended byte using an unscaled signed 9 bit
   OFFSET, with optional pre- or post-index write-back of the base
   register (which may be SP).  Write-back with the base also being
   the destination is treated as unallocated.  */
static void
ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;
  int64_t loaded;

  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  loaded = aarch64_get_mem_s8 (cpu, wb == Post ? base : updated);
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP, loaded);

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* 64 bit load of a sign-extended byte with an unsigned 12 bit OFFSET.
   The offset is passed on unscaled, so this reuses the write-back
   variant with write-back disabled.  */
static void
ldrsb_abs (sim_cpu *cpu, uint32_t offset)
{
  int32_t byte_offset = offset;

  ldrsb_wb (cpu, byte_offset, NoWriteBack);
}

/* 64 bit load of a sign-extended byte from base register plus a zero-
   or sign-extended 32 bit register offset.  SCALING is not used: no
   scaling is required for a byte load.  The base may be SP; the
   offset and destination registers are accessed with NO_SP.  */
static void
ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t target = base + index;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_s8 (cpu, target));
}

/* 32 bit load of a zero-extended halfword.  OFFSET is the unsigned
   12 bit immediate, scaled by the halfword size.  */
static void
ldrh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;
  uint32_t loaded;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The base may be SP; the destination may not.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 16);
  loaded = aarch64_get_mem_u16 (cpu, address);
  aarch64_set_reg_u32 (cpu, dest_reg, NO_SP, loaded);
}

/* 32 bit load of a zero-extended halfword using an unscaled signed
   9 bit OFFSET, with optional pre- or post-index write-back of the
   base register (which may be SP).  Write-back with the base also
   being the destination is treated as unallocated.  */
static void
ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  aarch64_set_reg_u32 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u16 (cpu, wb == Post ? base : updated));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* 32 bit load of a zero-extended halfword from base register plus a
   zero- or sign-extended 32 bit register offset, optionally scaled by
   the halfword size.  The base may be SP; the offset and destination
   registers are accessed with NO_SP.  */
static void
ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t target = base + (uint64_t) OPT_SCALE (index, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_u16 (cpu, target));
}

/* 32 bit load of a sign-extended halfword.  OFFSET is the unsigned
   12 bit immediate, scaled by the halfword size.  */
static void
ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;
  int32_t loaded;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The base may be SP; the destination may not.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 16);
  loaded = aarch64_get_mem_s16 (cpu, address);
  aarch64_set_reg_s32 (cpu, dest_reg, NO_SP, loaded);
}

/* 32 bit load of a sign-extended halfword using an unscaled signed
   9 bit OFFSET, with optional pre- or post-index write-back of the
   base register (which may be SP).  Write-back with the base also
   being the destination is treated as unallocated.  */
static void
ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;
  int32_t loaded;

  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Post-indexing accesses the unmodified base; every other mode
     accesses base + offset.  */
  loaded = (int32_t) aarch64_get_mem_s16 (cpu, wb == Post ? base : updated);
  aarch64_set_reg_s32 (cpu, dest_reg, NO_SP, loaded);

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* 32 bit load of a sign-extended halfword from base register plus a
   zero- or sign-extended 32 bit register offset, optionally scaled by
   the halfword size.  The base may be SP; the offset and destination
   registers are accessed with NO_SP.  */
static void
ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t target = base + (uint64_t) OPT_SCALE (index, 16, scaling);
  int32_t loaded;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  loaded = (int32_t) aarch64_get_mem_s16 (cpu, target);
  aarch64_set_reg_s32 (cpu, dest_reg, NO_SP, loaded);
}

/* 64 bit load of a sign-extended halfword.  OFFSET is the unsigned
   12 bit immediate, scaled by the halfword size.  */
static void
ldrsh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address;
  int64_t loaded;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The base may be SP; the destination may not.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 16);
  loaded = aarch64_get_mem_s16 (cpu, address);
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP, loaded);
}

/* Load a signed halfword into a 64 bit destination using the signed
   9 bit unscaled OFFSET.  OFFSET is applied before the access unless
   WB is Post, and the updated address is written back to the base
   register unless WB is NoWriteBack.  */
static void
ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;
  int64_t loaded;

  /* Write-back to a base register that is also the destination is
     treated as unallocated.  */
  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  loaded = aarch64_get_mem_s16 (cpu, wb == Post ? base : updated);
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP, loaded);

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Load a signed halfword into a 64 bit destination.  The address is
   the base register plus a 32 bit index register that is extended
   according to EXTENSION and optionally scaled for 16 bit elements
   according to SCALING.  */
static void
ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  /* Only the base register is read with SP_OK; the index and the
     destination use NO_SP.  */
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t delta = OPT_SCALE (index, 16, scaling);
  int64_t loaded;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  loaded = aarch64_get_mem_s16 (cpu, base + delta);
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP, loaded);
}

/* Load a signed 32 bit word into a 64 bit destination.  OFFSET is
   the unsigned 12 bit immediate, scaled here for 32 bit elements.  */
static void
ldrsw_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
			     + SCALE (offset, 32));
  /* The target register may not be SP but the source may be.
     N.B. no `return' here: ISO C does not allow a return statement
     with an expression in a function returning void.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* Load a signed 32 bit word into a 64 bit destination using the
   signed 9 bit unscaled OFFSET.  OFFSET is applied before the access
   unless WB is Post, and the updated address is written back to the
   base register unless WB is NoWriteBack.  */
static void
ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  /* Write-back to a base register that is also the destination is
     treated as unallocated.  */
  if (wb != NoWriteBack && base_reg == dest_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_s32 (cpu,
					    wb == Post ? base : updated));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Load a signed 32 bit word into a 64 bit destination.  The address
   is the base register plus a 32 bit index register that is extended
   according to EXTENSION and optionally scaled for 32 bit elements
   according to SCALING.  */
static void
ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  /* Only the base register is read with SP_OK; the index and the
     destination use NO_SP.  */
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t delta = OPT_SCALE (index, 32, scaling);
  uint64_t address = base + delta;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, dest_reg, NO_SP,
		       aarch64_get_mem_s32 (cpu, address));
}

/* N.B. with stores the value in source is written to the
   address identified by source2 modified by source3/offset.  */

/* Store the low 32 bits of a register.  OFFSET is the unsigned
   12 bit immediate, scaled here for 32 bit elements.  */
static void
str32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The base register is read with SP_OK, the stored register with
     NO_SP.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 32);
  aarch64_set_mem_u32 (cpu, address,
		       aarch64_get_reg_u32 (cpu, src_reg, NO_SP));
}

/* Store the low 32 bits of a register using the signed 9 bit
   unscaled OFFSET.  OFFSET is applied before the access unless WB is
   Post, and the updated address is written back to the base register
   unless WB is NoWriteBack.  */
static void
str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  /* Write-back to a base register that is also the stored register
     is treated as unallocated.  */
  if (wb != NoWriteBack && base_reg == src_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, wb == Post ? base : updated,
		       aarch64_get_reg_u32 (cpu, src_reg, NO_SP));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Store 32 bits of a register.  The address is the base register
   plus a 32 bit index register that is extended according to
   EXTENSION and optionally scaled for 32 bit elements according to
   SCALING.  */
static void
str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);

  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t delta = OPT_SCALE (index, 32, scaling);
  uint64_t address = base + delta;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* NOTE(review): the source is fetched with the 64 bit getter and
     handed to the 32 bit memory writer, whereas str32_abs and
     str32_wb use aarch64_get_reg_u32 -- confirm this is intended.  */
  aarch64_set_mem_u32 (cpu, address,
		       aarch64_get_reg_u64 (cpu, src_reg, NO_SP));
}

/* Store a 64 bit register.  OFFSET is the unsigned 12 bit immediate,
   scaled here for 64 bit elements.  */
static void
str_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The base register is read with SP_OK, the stored register with
     NO_SP.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 64);
  aarch64_set_mem_u64 (cpu, address,
		       aarch64_get_reg_u64 (cpu, src_reg, NO_SP));
}

/* Store a 64 bit register using the signed 9 bit unscaled OFFSET.
   OFFSET is applied before the access unless WB is Post, and the
   updated address is written back to the base register unless WB is
   NoWriteBack.  */
static void
str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  /* Write-back to a base register that is also the stored register
     is treated as unallocated.  */
  if (wb != NoWriteBack && base_reg == src_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, wb == Post ? base : updated,
		       aarch64_get_reg_u64 (cpu, src_reg, NO_SP));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Store a 64 bit register.  The address is the base register plus a
   32 bit index register that is extended according to EXTENSION and
   optionally scaled for 64 bit elements according to SCALING.  */
static void
str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);

  /* Only the base register is read with SP_OK; the index and the
     stored register use NO_SP.  */
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t delta = OPT_SCALE (index, 64, scaling);
  uint64_t address = base + delta;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address,
		       aarch64_get_reg_u64 (cpu, src_reg, NO_SP));
}

/* Store the low byte of a register at the base register plus the
   unsigned 12 bit immediate OFFSET.  */
static void
strb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The base register is read with SP_OK, the stored register with
     NO_SP.  OFFSET is used as is: byte accesses need no scaling.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + offset;
  aarch64_set_mem_u8 (cpu, address,
		      aarch64_get_reg_u8 (cpu, src_reg, NO_SP));
}

/* Store the low byte of a register using the signed 9 bit unscaled
   OFFSET.  OFFSET is applied before the access unless WB is Post,
   and the updated address is written back to the base register
   unless WB is NoWriteBack.  */
static void
strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  /* Write-back to a base register that is also the stored register
     is treated as unallocated.  */
  if (wb != NoWriteBack && base_reg == src_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu, wb == Post ? base : updated,
		      aarch64_get_reg_u8 (cpu, src_reg, NO_SP));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Store the low byte of a register.  The address is the base
   register plus a 32 bit index register extended according to
   EXTENSION.  SCALING is not consulted: byte accesses need no
   scaling.  */
static void
strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);

  /* Only the base register is read with SP_OK; the index and the
     stored register use NO_SP.  */
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t address = base + index;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu, address,
		      aarch64_get_reg_u8 (cpu, src_reg, NO_SP));
}

/* Store the low halfword of a register.  OFFSET is the unsigned
   12 bit immediate, scaled here for 16 bit elements.  */
static void
strh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The base register is read with SP_OK, the stored register with
     NO_SP.  */
  address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK) + SCALE (offset, 16);
  aarch64_set_mem_u16 (cpu, address,
		       aarch64_get_reg_u16 (cpu, src_reg, NO_SP));
}

/* Store the low halfword of a register using the signed 9 bit
   unscaled OFFSET.  OFFSET is applied before the access unless WB is
   Post, and the updated address is written back to the base register
   unless WB is NoWriteBack.  */
static void
strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  uint64_t base;
  uint64_t updated;

  /* Write-back to a base register that is also the stored register
     is treated as unallocated.  */
  if (wb != NoWriteBack && base_reg == src_reg)
    HALT_UNALLOC;

  base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  updated = base + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, wb == Post ? base : updated,
		       aarch64_get_reg_u16 (cpu, src_reg, NO_SP));

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, base_reg, SP_OK, updated);
}

/* Store the low halfword of a register.  The address is the base
   register plus a 32 bit index register that is extended according
   to EXTENSION and optionally scaled for 16 bit elements according
   to SCALING.  */
static void
strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned index_reg = INSTR (20, 16);
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);

  /* Only the base register is read with SP_OK; the index and the
     stored register use NO_SP.  */
  uint64_t base = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int64_t index = extend (aarch64_get_reg_u32 (cpu, index_reg, NO_SP),
			  extension);
  uint64_t delta = OPT_SCALE (index, 16, scaling);
  uint64_t address = base + delta;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address,
		       aarch64_get_reg_u16 (cpu, src_reg, NO_SP));
}

/* Prefetch unsigned 12 bit.
   Currently a no-op: the body contains no statements, so neither
   CPU nor OFFSET is used.  */
static void
prfm_abs (sim_cpu *cpu, uint32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     NOTE(review): the architecture also names PLI* hints (01xxx);
     confirm whether "ow ==> UNALLOC" should exclude them.

     Sketch of the decode an implementation would need:
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
     + SCALE (offset, 64).  */

  /* TODO : implement prefetch of address.  */
}

/* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.
   Currently a no-op: the body contains no statements, so none of the
   arguments is used.  */
static void
prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     NOTE(review): the architecture also names PLI* hints (01xxx);
     confirm whether "ow ==> UNALLOC" should exclude them.

     Sketch of the decode an implementation would need:
     rn may reference SP, rm may only reference ZR
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
     int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                extension);
     uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
     uint64_t address = base + displacement.  */

  /* TODO : implement prefetch of address  */
}

/* 64 bit pc-relative prefetch.
   Currently a no-op: the body contains no statements, so neither
   CPU nor OFFSET is used.  */
static void
prfm_pcrel (sim_cpu *cpu, int32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     NOTE(review): the architecture also names PLI* hints (01xxx);
     confirm whether "ow ==> UNALLOC" should exclude them.

     Sketch of the decode an implementation would need:
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_PC (cpu) + offset.  */

  /* TODO : implement this  */
}

/* Load-store exclusive.  */

/* Load exclusive.  Reads 1, 2, 4 or 8 bytes (selected by
   instr[31,30]) from the address held in the base register and
   writes the value, as an unsigned 64 bit quantity, to the
   destination.  The ordered (instr[15]) and exclusive (instr[23])
   fields are not examined.  */
static void
ldxr (sim_cpu *cpu)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int log2_bytes = INSTR (31, 30);
  uint64_t loaded;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (log2_bytes)
    {
    case 0:
      loaded = aarch64_get_mem_u8 (cpu, address);
      break;
    case 1:
      loaded = aarch64_get_mem_u16 (cpu, address);
      break;
    case 2:
      loaded = aarch64_get_mem_u32 (cpu, address);
      break;
    case 3:
      loaded = aarch64_get_mem_u64 (cpu, address);
      break;
    default:
      /* No other value is handled; leave the destination alone.  */
      return;
    }

  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, loaded);
}

/* Store exclusive.  Writes 1, 2, 4 or 8 bytes (selected by
   instr[31,30]) of the source register to the address held in the
   base register, then writes 0 to the status register: the store is
   always reported as having succeeded.  */
static void
stxr (sim_cpu *cpu)
{
  unsigned base_reg = INSTR (9, 5);
  unsigned src_reg = INSTR (4, 0);
  unsigned status_reg = INSTR (20, 16);
  uint64_t address = aarch64_get_reg_u64 (cpu, base_reg, SP_OK);
  int log2_bytes = INSTR (31, 30);
  uint64_t value = aarch64_get_reg_u64 (cpu, src_reg, NO_SP);

  if (log2_bytes == 0)
    aarch64_set_mem_u8 (cpu, address, value);
  else if (log2_bytes == 1)
    aarch64_set_mem_u16 (cpu, address, value);
  else if (log2_bytes == 2)
    aarch64_set_mem_u32 (cpu, address, value);
  else if (log2_bytes == 3)
    aarch64_set_mem_u64 (cpu, address, value);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* Always exclusive...  */
  aarch64_set_reg_u64 (cpu, status_reg, NO_SP, 0);
}

/* Decode a load-literal (pc-relative) instruction and hand the
   sign-extended 19 bit immediate to the matching handler.  */
static void
dexLoadLiteral (sim_cpu *cpu)
{
  /* instr[29,27] == 011
     instr[25,24] == 00
     instr[31,30]:instr[26] = opc: 000 ==> LDRW,  001 ==> FLDRS
                                   010 ==> LDRX,  011 ==> FLDRD
                                   100 ==> LDRSW, 101 ==> FLDRQ
                                   110 ==> PRFM,  111 ==> UNALLOC
     instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
     instr[23, 5] == simm19
     The target register, instr[4,0], is decoded by the handlers.  */

  uint32_t opc = INSTR (31, 30) << 1;
  int32_t simm19;

  opc |= INSTR (26, 26);
  simm19 = simm32 (aarch64_get_instr (cpu), 23, 5);

  switch (opc)
    {
    case 0:
      ldr32_pcrel (cpu, simm19);
      break;
    case 1:
      fldrs_pcrel (cpu, simm19);
      break;
    case 2:
      ldr_pcrel (cpu, simm19);
      break;
    case 3:
      fldrd_pcrel (cpu, simm19);
      break;
    case 4:
      ldrsw_pcrel (cpu, simm19);
      break;
    case 5:
      fldrq_pcrel (cpu, simm19);
      break;
    case 6:
      prfm_pcrel (cpu, simm19);
      break;
    case 7:
    default:
      HALT_UNALLOC;
    }
}

/* Immediate arithmetic
   The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
   value left shifted by 12 bits (done at decode).

   N.B. the register args (dest, source) can normally be Xn or SP.
   the exception occurs for flag setting instructions which may
   only use Xn for the output (dest).  */

/* 32 bit ADD of the already-shifted immediate AIMM.  Both the source
   and the destination are accessed with SP_OK.  */
static void
add32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint32_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The sum wraps at 32 bits before it is widened for the write.  */
  sum = aarch64_get_reg_u32 (cpu, src_reg, SP_OK) + aimm;
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, sum);
}

/* 64 bit ADD of the already-shifted immediate AIMM.  Both the source
   and the destination are accessed with SP_OK.  */
static void
add64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = aarch64_get_reg_u64 (cpu, src_reg, SP_OK) + aimm;
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, sum);
}

/* Set the N, Z, C and V flags for the 32 bit addition
   VALUE1 + VALUE2 and store them with aarch64_set_CPSR.

   All arithmetic is done on unsigned operands: unsigned addition
   wraps, whereas adding the signed operands directly would be
   undefined behaviour in exactly the overflowing cases that the V
   flag has to report.  */
static void
set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
{
  const uint32_t signbit = 1U << 31;
  uint32_t  uvalue1 = (uint32_t) value1;
  uint32_t  uvalue2 = (uint32_t) value2;
  uint32_t  result = uvalue1 + uvalue2;
  uint64_t  uresult = (uint64_t) uvalue1 + (uint64_t) uvalue2;
  uint32_t  flags = 0;

  if (result == 0)
    flags |= Z;

  if (result & signbit)
    flags |= N;

  /* Carry: the full-width unsigned sum does not fit in 32 bits.  */
  if (uresult != result)
    flags |= C;

  /* Signed overflow: both operands have the same sign and the sign
     of the result differs from it.  */
  if (~(uvalue1 ^ uvalue2) & (uvalue1 ^ result) & signbit)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}

/* Sign tests for the flag helpers.  Both macros expect a variable
   named `signbit', holding the top bit of the operand width, to be
   in scope at the point of use.  */
#define NEG(a) (((a) & signbit) == signbit)
#define POS(a) (((a) & signbit) == 0)

/* Set the N, Z, C and V flags for the 64 bit addition
   VALUE1 + VALUE2 and store them with aarch64_set_CPSR.  */
static void
set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
{
  const uint64_t signbit = 1ULL << 63;
  const uint64_t sum = value1 + value2;
  const int neg1 = NEG (value1);
  const int neg2 = NEG (value2);
  const int negsum = NEG (sum);
  uint32_t flags = 0;

  if (sum == 0)
    flags |= Z;

  if (negsum)
    flags |= N;

  /* Carry out of the top bit: both operands have it set, or one
     does and it is clear in the sum.  */
  if ((neg1 && neg2) || ((neg1 || neg2) && ! negsum))
    flags |= C;

  /* Signed overflow: the operands agree in sign and the sum does
     not.  */
  if (neg1 == neg2 && negsum != neg1)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}

/* Set the N, Z, C and V flags for the 32 bit subtraction
   VALUE1 - VALUE2 and store them with aarch64_set_CPSR.  */
static void
set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
{
  const uint32_t signbit = 1U << 31;
  const uint32_t diff = value1 - value2;
  const int neg1 = NEG (value1);
  const int neg2 = NEG (value2);
  const int negdiff = NEG (diff);
  uint32_t flags = 0;

  if (diff == 0)
    flags |= Z;

  if (negdiff)
    flags |= N;

  /* C is derived from the sign bits of the operands and the
     difference.  */
  if ((neg1 && ! neg2) || ((neg1 || ! neg2) && ! negdiff))
    flags |= C;

  /* Signed overflow: the operands differ in sign and the difference
     does not have the sign of VALUE1.  */
  if (neg1 != neg2 && negdiff != neg1)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}

/* Set the N, Z, C and V flags for the 64 bit subtraction
   VALUE1 - VALUE2 and store them with aarch64_set_CPSR.  */
static void
set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
{
  const uint64_t signbit = 1ULL << 63;
  const uint64_t diff = value1 - value2;
  const int neg1 = NEG (value1);
  const int neg2 = NEG (value2);
  const int negdiff = NEG (diff);
  uint32_t flags = 0;

  if (diff == 0)
    flags |= Z;

  if (negdiff)
    flags |= N;

  /* C is derived from the sign bits of the operands and the
     difference.  */
  if ((neg1 && ! neg2) || ((neg1 || ! neg2) && ! negdiff))
    flags |= C;

  /* Signed overflow: the operands differ in sign and the difference
     does not have the sign of VALUE1.  */
  if (neg1 != neg2 && negdiff != neg1)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}

/* Set the flags after a 32 bit logical operation: Z if RESULT is
   zero, N if bit 31 of RESULT is set.  No other flag bit is set in
   the value passed to aarch64_set_CPSR.  */
static void
set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
{
  /* FLAGS starts out as zero, so there is nothing to clear when a
     condition does not hold.  */
  uint32_t flags = 0;

  if (result == 0)
    flags |= Z;

  /* Use an unsigned constant: `1 << 31' shifts into the sign bit of
     an int, which is undefined behaviour.  */
  if (result & (1U << 31))
    flags |= N;

  aarch64_set_CPSR (cpu, flags);
}

/* Set the flags after a 64 bit logical operation: Z if RESULT is
   zero, N if bit 63 of RESULT is set.  No other flag bit is set in
   the value passed to aarch64_set_CPSR.  */
static void
set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
{
  const uint64_t top_bit = 1ULL << 63;
  uint32_t flags = 0;

  if (result == 0)
    flags |= Z;

  if ((result & top_bit) != 0)
    flags |= N;

  aarch64_set_CPSR (cpu, flags);
}

/* 32 bit ADD of the already-shifted immediate AIMM, setting the
   flags.  The source is read with SP_OK, the destination written
   with NO_SP.  */
static void
adds32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  /* TODO : do we need to worry about signs here?  */
  int32_t lhs = aarch64_get_reg_s32 (cpu, src_reg, SP_OK);
  uint32_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The addition is carried out in unsigned 32 bit arithmetic.  */
  sum = lhs + aimm;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
  set_flags_for_add32 (cpu, lhs, aimm);
}

/* 64 bit ADD of the already-shifted immediate AIMM, setting the
   flags.  The source is read with SP_OK, the destination written
   with NO_SP.  */
static void
adds64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t lhs = aarch64_get_reg_u64 (cpu, src_reg, SP_OK);
  uint64_t rhs = aimm;
  uint64_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = lhs + rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
  set_flags_for_add64 (cpu, lhs, rhs);
}

/* 32 bit SUB of the already-shifted immediate AIMM.  Both the source
   and the destination are accessed with SP_OK.  */
static void
sub32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint32_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The difference wraps at 32 bits before it is widened for the
     write.  */
  diff = aarch64_get_reg_u32 (cpu, src_reg, SP_OK) - aimm;
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, diff);
}

/* 64 bit SUB of the already-shifted immediate AIMM.  Both the source
   and the destination are accessed with SP_OK.  */
static void
sub64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = aarch64_get_reg_u64 (cpu, src_reg, SP_OK) - aimm;
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, diff);
}

/* 32 bit SUB of the already-shifted immediate AIMM, setting the
   flags.  The source is read with SP_OK, the destination written
   with NO_SP.  */
static void
subs32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  /* NOTE(review): the source is fetched with the 64 bit getter and
     truncated by the assignment, whereas sub32 uses
     aarch64_get_reg_u32 -- confirm this is intended.  */
  uint32_t lhs = aarch64_get_reg_u64 (cpu, src_reg, SP_OK);
  uint32_t rhs = aimm;
  uint32_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = lhs - rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
  set_flags_for_sub32 (cpu, lhs, rhs);
}

/* 64 bit SUB of the already-shifted immediate AIMM, setting the
   flags.  The source is read with SP_OK, the destination written
   with NO_SP.  */
static void
subs64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned src_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t lhs = aarch64_get_reg_u64 (cpu, src_reg, SP_OK);
  uint32_t rhs = aimm;
  uint64_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = lhs - rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
  set_flags_for_sub64 (cpu, lhs, rhs);
}

/* Data Processing Register.  */

/* First two helpers to perform the shift operations.  */

/* Return the 32 bit VALUE shifted or rotated by COUNT bits as
   selected by SHIFT (LSL, LSR, ASR or ROR).  Any other SHIFT value
   is treated as LSL.
   COUNT is assumed to be below 32; larger counts are undefined
   behaviour in C -- NOTE(review): confirm every caller checks
   this.  */
static inline uint32_t
shifted32 (uint32_t value, Shift shift, uint32_t count)
{
  switch (shift)
    {
    default:
    case LSL:
      return (value << count);
    case LSR:
      return (value >> count);
    case ASR:
      {
	/* Shift a signed copy so that the sign bit is replicated
	   (as the compilers used for the simulator do).  */
	int32_t svalue = value;
	return (svalue >> count);
      }
    case ROR:
      {
	uint32_t top;
	uint32_t bottom;

	/* A rotate by zero leaves the value alone.  It must not reach
	   the general case, where `value << (32 - 0)' would shift by
	   the full width of the type, which is undefined.  */
	if (count == 0)
	  return value;

	top = value >> count;
	bottom = value << (32 - count);
	return (bottom | top);
      }
    }
}

/* Return the 64 bit VALUE shifted or rotated by COUNT bits as
   selected by SHIFT (LSL, LSR, ASR or ROR).  Any other SHIFT value
   is treated as LSL.
   COUNT is assumed to be below 64; larger counts are undefined
   behaviour in C -- NOTE(review): confirm every caller checks
   this.  */
static inline uint64_t
shifted64 (uint64_t value, Shift shift, uint32_t count)
{
  switch (shift)
    {
    default:
    case LSL:
      return (value << count);
    case LSR:
      return (value >> count);
    case ASR:
      {
	/* Shift a signed copy so that the sign bit is replicated
	   (as the compilers used for the simulator do).  */
	int64_t svalue = value;
	return (svalue >> count);
      }
    case ROR:
      {
	uint64_t top;
	uint64_t bottom;

	/* A rotate by zero leaves the value alone.  It must not reach
	   the general case, where `value << (64 - 0)' would shift by
	   the full width of the type, which is undefined.  */
	if (count == 0)
	  return value;

	top = value >> count;
	bottom = value << (64 - count);
	return (bottom | top);
      }
    }
}

/* Arithmetic shifted register.
   These allow an optional LSL, ASR or LSR to the second source
   register with a count up to the register bit count.

   N.B register args may not be SP.  */

/* 32 bit ADD of a register and a second register shifted by COUNT
   bits as selected by SHIFT.  All registers use NO_SP.  */
static void
add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint32_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The sum wraps at 32 bits before it is widened for the write.  */
  sum = aarch64_get_reg_u32 (cpu, lhs_reg, NO_SP)
    + shifted32 (aarch64_get_reg_u32 (cpu, rhs_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
}

/* 64 bit ADD of a register and a second register shifted by COUNT
   bits as selected by SHIFT.  All registers use NO_SP.  */
static void
add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = aarch64_get_reg_u64 (cpu, lhs_reg, NO_SP)
    + shifted64 (aarch64_get_reg_u64 (cpu, rhs_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
}

/* 32 bit ADD of a register and a second register shifted by COUNT
   bits as selected by SHIFT, setting the flags.  All registers use
   NO_SP.  */
static void
adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint32_t lhs = aarch64_get_reg_u32 (cpu, lhs_reg, NO_SP);
  uint32_t rhs = shifted32 (aarch64_get_reg_u32 (cpu, rhs_reg, NO_SP),
			    shift, count);
  uint32_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = lhs + rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
  set_flags_for_add32 (cpu, lhs, rhs);
}

/* 64 bit ADD of a register and a second register shifted by COUNT
   bits as selected by SHIFT, setting the flags.  All registers use
   NO_SP.  */
static void
adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint64_t lhs = aarch64_get_reg_u64 (cpu, lhs_reg, NO_SP);
  uint64_t rhs = shifted64 (aarch64_get_reg_u64 (cpu, rhs_reg, NO_SP),
			    shift, count);
  uint64_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = lhs + rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
  set_flags_for_add64 (cpu, lhs, rhs);
}

/* 32 bit SUB: a register minus a second register shifted by COUNT
   bits as selected by SHIFT.  All registers use NO_SP.  */
static void
sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint32_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The difference wraps at 32 bits before it is widened for the
     write.  */
  diff = aarch64_get_reg_u32 (cpu, lhs_reg, NO_SP)
    - shifted32 (aarch64_get_reg_u32 (cpu, rhs_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
}

/* 64 bit SUB: a register minus a second register shifted by COUNT
   bits as selected by SHIFT.  All registers use NO_SP.  */
static void
sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = aarch64_get_reg_u64 (cpu, lhs_reg, NO_SP)
    - shifted64 (aarch64_get_reg_u64 (cpu, rhs_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
}

/* 32 bit SUB: a register minus a second register shifted by COUNT
   bits as selected by SHIFT, setting the flags.  All registers use
   NO_SP.  */
static void
subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint32_t lhs = aarch64_get_reg_u32 (cpu, lhs_reg, NO_SP);
  uint32_t rhs = shifted32 (aarch64_get_reg_u32 (cpu, rhs_reg, NO_SP),
			    shift, count);
  uint32_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = lhs - rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
  set_flags_for_sub32 (cpu, lhs, rhs);
}

/* 64 bit SUB: a register minus a second register shifted by COUNT
   bits as selected by SHIFT, setting the flags.  All registers use
   NO_SP.  */
static void
subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint64_t lhs = aarch64_get_reg_u64 (cpu, lhs_reg, NO_SP);
  uint64_t rhs = shifted64 (aarch64_get_reg_u64 (cpu, rhs_reg, NO_SP),
			    shift, count);
  uint64_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = lhs - rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
  set_flags_for_sub64 (cpu, lhs, rhs);
}

/* First a couple more helpers to fetch the
   relevant source register element either
   sign or zero extended as required by the
   extension value.  */

/* Fetch register LO (with NO_SP) through the getter selected by
   EXTENSION and return it converted to 32 bits.  UXTW and UXTX both
   read the unsigned 32 bit view; SXTW, SXTX and any unlisted value
   read the signed 32 bit view.  */
static uint32_t
extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
{
  uint32_t value;

  switch (extension)
    {
    case UXTB:
      value = aarch64_get_reg_u8 (cpu, lo, NO_SP);
      break;
    case UXTH:
      value = aarch64_get_reg_u16 (cpu, lo, NO_SP);
      break;
    case UXTW:
    case UXTX:
      value = aarch64_get_reg_u32 (cpu, lo, NO_SP);
      break;
    case SXTB:
      value = aarch64_get_reg_s8 (cpu, lo, NO_SP);
      break;
    case SXTH:
      value = aarch64_get_reg_s16 (cpu, lo, NO_SP);
      break;
    case SXTW:
    case SXTX:
    default:
      value = aarch64_get_reg_s32 (cpu, lo, NO_SP);
      break;
    }

  return value;
}

/* Fetch register LO (with NO_SP) through the getter selected by
   EXTENSION and return it converted to 64 bits.  SXTX and any
   unlisted value read the signed 64 bit view.  */
static uint64_t
extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
{
  uint64_t value;

  switch (extension)
    {
    case UXTB:
      value = aarch64_get_reg_u8 (cpu, lo, NO_SP);
      break;
    case UXTH:
      value = aarch64_get_reg_u16 (cpu, lo, NO_SP);
      break;
    case UXTW:
      value = aarch64_get_reg_u32 (cpu, lo, NO_SP);
      break;
    case UXTX:
      value = aarch64_get_reg_u64 (cpu, lo, NO_SP);
      break;
    case SXTB:
      value = aarch64_get_reg_s8 (cpu, lo, NO_SP);
      break;
    case SXTH:
      value = aarch64_get_reg_s16 (cpu, lo, NO_SP);
      break;
    case SXTW:
      value = aarch64_get_reg_s32 (cpu, lo, NO_SP);
      break;
    case SXTX:
    default:
      value = aarch64_get_reg_s64 (cpu, lo, NO_SP);
      break;
    }

  return value;
}

/* Arithmetic extending register.
   These allow an optional sign or zero extension of some portion of
   the second source register followed by an optional left shift of
   between 0 and 4 bits.

   N.B. the output (dest) and first input arg (source) may normally be
   Xn or SP.  However, for flag setting operations dest can only be
   Xn.  Second input registers are always Xn.  */

/* 32 bit ADD of a register and a second register extended according
   to EXTENSION and then shifted left by SHIFT bits.  The first
   source and the destination are accessed with SP_OK.  */
static void
add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint32_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The sum wraps at 32 bits before it is widened for the write.  */
  sum = aarch64_get_reg_u32 (cpu, lhs_reg, SP_OK)
    + (extreg32 (cpu, rhs_reg, extension) << shift);
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, sum);
}

/* 64 bit ADD of a register and a second register extended according
   to EXTENSION and then shifted left by SHIFT bits.  The first
   source and the destination are accessed with SP_OK.
   N.B. This subsumes the case with 64 bit source2 and UXTX #n or
   LSL #0.  */
static void
add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = aarch64_get_reg_u64 (cpu, lhs_reg, SP_OK)
    + (extreg64 (cpu, rhs_reg, extension) << shift);
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, sum);
}

/* 32 bit ADD of a register and a second register extended according
   to EXTENSION and then shifted left by SHIFT bits, setting the
   flags.  The first source is read with SP_OK, the destination
   written with NO_SP.  */
static void
adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint32_t lhs = aarch64_get_reg_u32 (cpu, lhs_reg, SP_OK);
  uint32_t rhs = extreg32 (cpu, rhs_reg, extension) << shift;
  uint32_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = lhs + rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
  set_flags_for_add32 (cpu, lhs, rhs);
}

/* 64 bit ADD of a register and a second register extended according
   to EXTENSION and then shifted left by SHIFT bits, setting the
   flags.  The first source is read with SP_OK, the destination
   written with NO_SP.
   N.B. this subsumes the case with 64 bit source2 and UXTX #n or
   LSL #0.  */
static void
adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint64_t lhs = aarch64_get_reg_u64 (cpu, lhs_reg, SP_OK);
  uint64_t rhs = extreg64 (cpu, rhs_reg, extension) << shift;
  uint64_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  sum = lhs + rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, sum);
  set_flags_for_add64 (cpu, lhs, rhs);
}

/* 32 bit SUB: a register minus a second register extended according
   to EXTENSION and then shifted left by SHIFT bits.  The first
   source and the destination are accessed with SP_OK.  */
static void
sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint32_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The difference wraps at 32 bits before it is widened for the
     write.  */
  diff = aarch64_get_reg_u32 (cpu, lhs_reg, SP_OK)
    - (extreg32 (cpu, rhs_reg, extension) << shift);
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, diff);
}

/* 64 bit SUB: a register minus a second register extended according
   to EXTENSION and then shifted left by SHIFT bits.  The first
   source and the destination are accessed with SP_OK.
   N.B. this subsumes the case with 64 bit source2 and UXTX #n or
   LSL #0.  */
static void
sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);
  uint64_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = aarch64_get_reg_u64 (cpu, lhs_reg, SP_OK)
    - (extreg64 (cpu, rhs_reg, extension) << shift);
  aarch64_set_reg_u64 (cpu, dest_reg, SP_OK, diff);
}

/* 32 bit SUB: a register minus a second register extended according
   to EXTENSION and then shifted left by SHIFT bits, setting the
   flags.  The first source is read with SP_OK, the destination
   written with NO_SP.  */
static void
subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint32_t lhs = aarch64_get_reg_u32 (cpu, lhs_reg, SP_OK);
  uint32_t rhs = extreg32 (cpu, rhs_reg, extension) << shift;
  uint32_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = lhs - rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
  set_flags_for_sub32 (cpu, lhs, rhs);
}

/* 64 bit SUB: a register minus a second register extended according
   to EXTENSION and then shifted left by SHIFT bits, setting the
   flags.  The first source is read with SP_OK, the destination
   written with NO_SP.
   N.B. this subsumes the case with 64 bit source2 and UXTX #n or
   LSL #0.  */
static void
subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
{
  unsigned rhs_reg = INSTR (20, 16);
  unsigned lhs_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  uint64_t lhs = aarch64_get_reg_u64 (cpu, lhs_reg, SP_OK);
  uint64_t rhs = extreg64 (cpu, rhs_reg, extension) << shift;
  uint64_t diff;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  diff = lhs - rhs;
  aarch64_set_reg_u64 (cpu, dest_reg, NO_SP, diff);
  set_flags_for_sub64 (cpu, lhs, rhs);
}

/* Decode an add/subtract (immediate) instruction and hand it to the
   matching add/adds/sub/subs routine.  */
static void
dexAddSubtractImmediate (sim_cpu *cpu)
{
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = op : 0 ==> ADD, 1 ==> SUB
     instr[29]    = set : 0 ==> no flags, 1 ==> set flags
     instr[28,24] = 10001
     instr[23,22] = shift : 00 == LSL#0, 01 = LSL#12 1x = UNALLOC
     instr[21,10] = uimm12
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  uint32_t lsl12 = INSTR (23, 22);
  uint32_t operand = INSTR (21, 10);
  uint32_t is64 = INSTR (31, 31);
  uint32_t op_set = INSTR (30, 29);

  NYI_assert (28, 24, 0x11);

  if (lsl12 > 1)
    HALT_UNALLOC;

  /* The optional LSL #12 is folded into the immediate here, so the
     arithmetic routines only ever see the final operand.  */
  if (lsl12 != 0)
    operand <<= 12;

  if (is64)
    switch (op_set)
      {
      case 0: add64 (cpu, operand); break;
      case 1: adds64 (cpu, operand); break;
      case 2: sub64 (cpu, operand); break;
      case 3: subs64 (cpu, operand); break;
      }
  else
    switch (op_set)
      {
      case 0: add32 (cpu, operand); break;
      case 1: adds32 (cpu, operand); break;
      case 2: sub32 (cpu, operand); break;
      case 3: subs32 (cpu, operand); break;
      }
}

/* Decode an add/subtract (shifted register) instruction and hand it to
   the matching add/adds/sub/subs shift routine.  */
static void
dexAddSubtractShiftedRegister (sim_cpu *cpu)
{
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
     instr[28,24] = 01011
     instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
     instr[21]    = 0
     instr[20,16] = Rm
     instr[15,10] = count : must be 0xxxxx for 32 bit
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  Shift kind = INSTR (23, 22);
  uint32_t amount = INSTR (15, 10);
  uint32_t is64 = INSTR (31, 31);
  uint32_t op = INSTR (30, 29);

  NYI_assert (28, 24, 0x0B);
  NYI_assert (21, 21, 0);

  /* The ROR shift encoding is unallocated for add/subtract.  */
  if (kind == ROR)
    HALT_UNALLOC;

  /* A 32 bit operation with count[5] set is unallocated.  */
  if (! is64 && (amount & 0x20) != 0)
    HALT_UNALLOC;

  if (is64)
    switch (op)
      {
      case 0: add64_shift  (cpu, kind, amount); break;
      case 1: adds64_shift (cpu, kind, amount); break;
      case 2: sub64_shift  (cpu, kind, amount); break;
      case 3: subs64_shift (cpu, kind, amount); break;
      }
  else
    switch (op)
      {
      case 0: add32_shift  (cpu, kind, amount); break;
      case 1: adds32_shift (cpu, kind, amount); break;
      case 2: sub32_shift  (cpu, kind, amount); break;
      case 3: subs32_shift (cpu, kind, amount); break;
      }
}

/* Decode an add/subtract (extended register) instruction and hand it to
   the matching add/adds/sub/subs extend routine.  Encodings with a
   non-zero opt field or a shift amount above 4 halt as unallocated.  */
static void
dexAddSubtractExtendedRegister (sim_cpu *cpu)
{
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = op : 0 ==> ADD, 1 ==> SUB
     instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
     instr[28,24] = 01011
     instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
     instr[21]    = 1
     instr[20,16] = Rm
     instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
                             010 ==> LSL|UXTW, 011 ==> UXTX,
                             100 ==> SXTB, 101 ==> SXTH,
                             110 ==> SXTW, 111 ==> SXTX,
     instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  Extension extensionType = INSTR (15, 13);
  uint32_t shift = INSTR (12, 10);

  NYI_assert (28, 24, 0x0B);
  NYI_assert (21, 21, 1);

  /* Only opt == 00 is allocated.  */
  if (INSTR (23, 22) != 0)
    HALT_UNALLOC;

  /* Shift may not exceed 4.  */
  if (shift > 4)
    HALT_UNALLOC;

  /* Dispatch on size:op:set?.  */
  switch (INSTR (31, 29))
    {
    case 0: add32_ext  (cpu, extensionType, shift); break;
    case 1: adds32_ext (cpu, extensionType, shift); break;
    case 2: sub32_ext  (cpu, extensionType, shift); break;
    case 3: subs32_ext (cpu, extensionType, shift); break;
    case 4: add64_ext  (cpu, extensionType, shift); break;
    case 5: adds64_ext (cpu, extensionType, shift); break;
    case 6: sub64_ext  (cpu, extensionType, shift); break;
    case 7: subs64_ext (cpu, extensionType, shift); break;
    }
}

/* Conditional data processing
   Condition register is implicit 3rd source.  */

/* ADC, 32 bit form: Rd = Rn + Rm + C.
   N.B. all registers are accessed with NO_SP.  The flags are read but
   not updated.  */

static void
adc32 (sim_cpu *cpu)
{
  unsigned src2 = INSTR (20, 16);
  unsigned src1 = INSTR (9, 5);
  unsigned dest = INSTR (4, 0);
  uint32_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  sum = aarch64_get_reg_u32 (cpu, src1, NO_SP);
  sum += aarch64_get_reg_u32 (cpu, src2, NO_SP);
  sum += IS_SET (C);

  aarch64_set_reg_u64 (cpu, dest, NO_SP, sum);
}

/* ADC, 64 bit form: Rd = Rn + Rm + C.
   All registers are accessed with NO_SP.  The flags are read but not
   updated.  */
static void
adc64 (sim_cpu *cpu)
{
  unsigned src2 = INSTR (20, 16);
  unsigned src1 = INSTR (9, 5);
  unsigned dest = INSTR (4, 0);
  uint64_t sum;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  sum = aarch64_get_reg_u64 (cpu, src1, NO_SP);
  sum += aarch64_get_reg_u64 (cpu, src2, NO_SP);
  sum += IS_SET (C);

  aarch64_set_reg_u64 (cpu, dest, NO_SP, sum);
}

/* ADCS, 32 bit form: Rd = Rn + Rm + C, setting N, Z, C and V.

   The flags are derived from the full three-input sum.  Folding the
   carry into the second operand first would wrap when Rm is 0xFFFFFFFF
   (losing the carry out) and would change the operand's sign when Rm is
   0x7FFFFFFF (giving the wrong overflow flag).  */
static void
adcs32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  uint32_t carry = IS_SET (C);

  /* Widen so that the carry out of bit 31 is visible in bit 32.  */
  uint64_t wide = (uint64_t) value1 + value2 + carry;
  uint32_t result = (uint32_t) wide;
  uint32_t flags = 0;

  if (result == 0)
    flags |= Z;

  if (result >> 31)
    flags |= N;

  if (wide >> 32)
    flags |= C;

  /* Signed overflow: both operands have the same sign and the result
     has the other one.  */
  if (((value1 ^ result) & (value2 ^ result)) >> 31)
    flags |= V;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
  aarch64_set_CPSR (cpu, flags);
}

/* ADCS, 64 bit form: Rd = Rn + Rm + C, setting N, Z, C and V.

   The flags are derived from the full three-input sum.  Folding the
   carry into the second operand first would wrap when Rm is all ones
   (losing the carry out) and would change the operand's sign when Rm is
   the largest positive value (giving the wrong overflow flag).  */
static void
adcs64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  uint64_t carry = IS_SET (C);

  /* Add in two steps so that a wrap in either step can be detected.  */
  uint64_t partial = value1 + value2;
  uint64_t result = partial + carry;
  uint32_t flags = 0;

  if (result == 0)
    flags |= Z;

  if (result >> 63)
    flags |= N;

  /* At most one of the two additions can wrap.  */
  if (partial < value1 || result < partial)
    flags |= C;

  /* Signed overflow: both operands have the same sign and the result
     has the other one.  */
  if (((value1 ^ result) & (value2 ^ result)) >> 63)
    flags |= V;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
  aarch64_set_CPSR (cpu, flags);
}

/* SBC, 32 bit form: Rd = Rn - Rm - 1 + C, computed here in the
   equivalent form Rn + NOT (Rm) + C.  All registers are accessed with
   NO_SP.  The flags are read but not updated.  */
static void
sbc32 (sim_cpu *cpu)
{
  unsigned src2 = INSTR (20, 16);
  unsigned src1 = INSTR (9, 5); /* ngc iff src1 == 31.  */
  unsigned dest = INSTR (4, 0);
  uint32_t lhs;
  uint32_t rhs;
  uint32_t carry_in;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  lhs = aarch64_get_reg_u32 (cpu, src1, NO_SP);
  rhs = aarch64_get_reg_u32 (cpu, src2, NO_SP);
  carry_in = IS_SET (C);

  /* In modulo 2^32 arithmetic, - rhs - 1 is the same as ~ rhs.  */
  aarch64_set_reg_u64 (cpu, dest, NO_SP, lhs + ~ rhs + carry_in);
}

/* SBC, 64 bit form: Rd = Rn - Rm - 1 + C, computed here in the
   equivalent form Rn + NOT (Rm) + C.  All registers are accessed with
   NO_SP.  The flags are read but not updated.  */
static void
sbc64 (sim_cpu *cpu)
{
  unsigned src2 = INSTR (20, 16);
  unsigned src1 = INSTR (9, 5);
  unsigned dest = INSTR (4, 0);
  uint64_t lhs;
  uint64_t rhs;
  uint64_t carry_in;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  lhs = aarch64_get_reg_u64 (cpu, src1, NO_SP);
  rhs = aarch64_get_reg_u64 (cpu, src2, NO_SP);
  carry_in = IS_SET (C);

  /* In modulo 2^64 arithmetic, - rhs - 1 is the same as ~ rhs.  */
  aarch64_set_reg_u64 (cpu, dest, NO_SP, lhs + ~ rhs + carry_in);
}

/* SBCS, 32 bit form: Rd = Rn - Rm - 1 + C, setting N, Z, C and V.

   The operation is evaluated as Rn + NOT (Rm) + C, which matches the
   non-flag-setting sbc32 above and lets the flags be taken from the
   full three-input sum: C is the carry out of bit 31 and V is the
   signed overflow of that addition.  */
static void
sbcs32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t value2 = ~ aarch64_get_reg_u32 (cpu, rm, NO_SP);
  uint32_t carry  = IS_SET (C);

  /* Widen so that the carry out of bit 31 is visible in bit 32.  */
  uint64_t wide   = (uint64_t) value1 + value2 + carry;
  uint32_t result = (uint32_t) wide;
  uint32_t flags  = 0;

  if (result == 0)
    flags |= Z;

  if (result >> 31)
    flags |= N;

  /* Carry set means that no borrow occurred.  */
  if (wide >> 32)
    flags |= C;

  /* Signed overflow: both addends have the same sign and the result
     has the other one.  */
  if (((value1 ^ result) & (value2 ^ result)) >> 31)
    flags |= V;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
  aarch64_set_CPSR (cpu, flags);
}

/* SBCS, 64 bit form: Rd = Rn - Rm - 1 + C, setting N, Z, C and V.

   The operation is evaluated as Rn + NOT (Rm) + C, which matches the
   non-flag-setting sbc64 above and lets the flags be taken from the
   full three-input sum: C is the carry out of bit 63 and V is the
   signed overflow of that addition.  */
static void
sbcs64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t value2 = ~ aarch64_get_reg_u64 (cpu, rm, NO_SP);
  uint64_t carry  = IS_SET (C);

  /* Add in two steps so that a wrap in either step can be detected.  */
  uint64_t partial = value1 + value2;
  uint64_t result  = partial + carry;
  uint32_t flags   = 0;

  if (result == 0)
    flags |= Z;

  if (result >> 63)
    flags |= N;

  /* Carry set means that no borrow occurred.  At most one of the two
     additions can wrap.  */
  if (partial < value1 || result < partial)
    flags |= C;

  /* Signed overflow: both addends have the same sign and the result
     has the other one.  */
  if (((value1 ^ result) & (value2 ^ result)) >> 63)
    flags |= V;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
  aarch64_set_CPSR (cpu, flags);
}

/* Decode an add/subtract (with carry) instruction and hand it to the
   matching adc/adcs/sbc/sbcs routine.  */
static void
dexAddSubtractWithCarry (sim_cpu *cpu)
{
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = op : 0 ==> ADC, 1 ==> SBC
     instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
     instr[28,21] = 1 1010 000
     instr[20,16] = Rm
     instr[15,10] = op2 : 00000 ==> ok, ow ==> UNALLOC
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  uint32_t is64 = INSTR (31, 31);
  uint32_t op_set = INSTR (30, 29);

  NYI_assert (28, 21, 0xD0);

  /* Any non-zero op2 field is unallocated.  */
  if (INSTR (15, 10) != 0)
    HALT_UNALLOC;

  if (is64)
    switch (op_set)
      {
      case 0: adc64 (cpu); break;
      case 1: adcs64 (cpu); break;
      case 2: sbc64 (cpu); break;
      case 3: sbcs64 (cpu); break;
      }
  else
    switch (op_set)
      {
      case 0: adc32 (cpu); break;
      case 1: adcs32 (cpu); break;
      case 2: sbc32 (cpu); break;
      case 3: sbcs32 (cpu); break;
      }
}

/* Evaluate condition code CC against the current N, Z, C and V flags.
   Returns 1 if the condition holds and 0 otherwise.  AL, NV and any
   unrecognised code always hold.  */
static uint32_t
testConditionCode (sim_cpu *cpu, CondCode cc)
{
  /* A plain switch is used here; a branchless formulation built from
     the bits of CC and the flag register would also be possible.  */
  switch (cc)
    {
    case EQ:
      return IS_SET (Z);
    case NE:
      return IS_CLEAR (Z);
    case CS:
      return IS_SET (C);
    case CC:
      return IS_CLEAR (C);
    case MI:
      return IS_SET (N);
    case PL:
      return IS_CLEAR (N);
    case VS:
      return IS_SET (V);
    case VC:
      return IS_CLEAR (V);
    case HI:
      return IS_SET (C) && IS_CLEAR (Z);
    case LS:
      return IS_CLEAR (C) || IS_SET (Z);
    case GE:
      return IS_SET (N) == IS_SET (V);
    case LT:
      return IS_SET (N) != IS_SET (V);
    case GT:
      return IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V));
    case LE:
      return IS_SET (Z) || (IS_SET (N) != IS_SET (V));
    case AL:
    case NV:
    default:
      break;
    }

  return 1;
}

/* Conditional compare.  If the condition holds, the flags are set from
   comparing Rn with the second operand (a register or a 5 bit
   immediate); otherwise the flags are loaded from the instruction's
   nzcv field.

   CCMP sets the flags as for Rn - operand2 and CCMN as for
   Rn + operand2.  CCMN must use the addition flag rules: subtracting
   the negated operand gives a different carry (e.g. 0 + 0) and a
   different overflow for the most negative operand.  Register number
   31 names the zero register here, hence NO_SP.  */
static void
CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn  */
{
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = op : 0 ==> CCMN (compare negative),
                         1 ==> CCMP (compare)
     instr[29,21] = 1 1101 0010
     instr[20,16] = Rm or const
     instr[15,12] = cond
     instr[11]    = compare reg (0) or const (1)
     instr[10]    = 0
     instr[9,5]   = Rn
     instr[4]     = 0
     instr[3,0]   = value for CPSR bits if the comparison does not take place.  */
  unsigned is_ccmp;
  unsigned use_imm;
  unsigned rm;
  unsigned rn;

  NYI_assert (29, 21, 0x1d2);
  NYI_assert (10, 10, 0);
  NYI_assert (4, 4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (! testConditionCode (cpu, INSTR (15, 12)))
    {
      aarch64_set_CPSR (cpu, INSTR (3, 0));
      return;
    }

  is_ccmp = INSTR (30, 30);
  use_imm = INSTR (11, 11);
  rm = INSTR (20, 16);
  rn = INSTR ( 9,  5);

  if (INSTR (31, 31))
    {
      uint64_t operand1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
      /* In the immediate form the Rm field holds the value itself.  */
      uint64_t operand2 = use_imm
        ? (uint64_t) rm : aarch64_get_reg_u64 (cpu, rm, NO_SP);

      if (is_ccmp)
        set_flags_for_sub64 (cpu, operand1, operand2);
      else
        set_flags_for_add64 (cpu, operand1, operand2);
    }
  else
    {
      uint32_t operand1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
      /* In the immediate form the Rm field holds the value itself.  */
      uint32_t operand2 = use_imm
        ? (uint32_t) rm : aarch64_get_reg_u32 (cpu, rm, NO_SP);

      if (is_ccmp)
        set_flags_for_sub32 (cpu, operand1, operand2);
      else
        set_flags_for_add32 (cpu, operand1, operand2);
    }
}

/* Copy a whole vector register.  Only the MOV alias of ORR (both source
   fields naming the same register) is handled; anything else halts as
   not yet implemented.  */
static void
do_vec_MOV_whole_vector (sim_cpu *cpu)
{
  /* MOV Vd.T, Vs.T  (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)

     instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,21] = 001110101
     instr[20,16] = Vs
     instr[15,10] = 000111
     instr[9,5]   = Vs
     instr[4,0]   = Vd  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  unsigned remaining;

  NYI_assert (29, 21, 0x075);
  NYI_assert (15, 10, 0x07);

  if (src != INSTR (20, 16))
    HALT_NYI;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* Copy one 64 bit lane for the half form and two for the full form,
     highest lane first.  */
  remaining = INSTR (30, 30) ? 2 : 1;
  while (remaining-- > 0)
    aarch64_set_vec_u64 (cpu, dst, remaining,
                         aarch64_get_vec_u64 (cpu, src, remaining));
}

/* SMOV: move one signed vector element into a general register,
   sign-extending it to 32 or 64 bits according to instr[30].  */
static void
do_vec_SMOV_into_scalar (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = word(0)/long(1)
     instr[29,21] = 00 1110 000
     instr[20,16] = element size and index
     instr[15,10] = 00 0010 11
     instr[9,5]   = V source
     instr[4,0]   = R dest  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  unsigned imm5 = INSTR (20, 16);
  unsigned wide = INSTR (30, 30);

  NYI_assert (29, 21, 0x070);
  NYI_assert (15, 10, 0x0B);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The lowest set bit of imm5 selects the element size; the bits
     above it form the element index.  */
  if (imm5 & 0x1)
    {
      int lane = (imm5 >> 1) & 0xF;

      if (wide)
        aarch64_set_reg_s64 (cpu, dst, NO_SP,
                             aarch64_get_vec_s8 (cpu, src, lane));
      else
        aarch64_set_reg_s32 (cpu, dst, NO_SP,
                             aarch64_get_vec_s8 (cpu, src, lane));
    }
  else if (imm5 & 0x2)
    {
      int lane = (imm5 >> 2) & 0x7;

      if (wide)
        aarch64_set_reg_s64 (cpu, dst, NO_SP,
                             aarch64_get_vec_s16 (cpu, src, lane));
      else
        aarch64_set_reg_s32 (cpu, dst, NO_SP,
                             aarch64_get_vec_s16 (cpu, src, lane));
    }
  else if (wide && (imm5 & 0x4))
    {
      /* 32 bit elements are only accepted with a 64 bit destination.  */
      int lane = (imm5 >> 3) & 0x3;

      aarch64_set_reg_s64 (cpu, dst, NO_SP,
                           aarch64_get_vec_s32 (cpu, src, lane));
    }
  else
    HALT_UNALLOC;
}

/* UMOV: move one unsigned vector element into a general register.
   The word form accepts 8, 16 and 32 bit elements; the long form
   accepts only 64 bit elements.  */
static void
do_vec_UMOV_into_scalar (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = word(0)/long(1)
     instr[29,21] = 00 1110 000
     instr[20,16] = element size and index
     instr[15,10] = 00 0011 11
     instr[9,5]   = V source
     instr[4,0]   = R dest  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  unsigned imm5 = INSTR (20, 16);
  unsigned wide = INSTR (30, 30);

  NYI_assert (29, 21, 0x070);
  NYI_assert (15, 10, 0x0F);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (wide)
    {
      if (imm5 & 0x8)
        aarch64_set_reg_u64 (cpu, dst, NO_SP,
                             aarch64_get_vec_u64 (cpu, src,
                                                  (imm5 >> 4) & 0x1));
      else
        HALT_UNALLOC;
    }
  /* For the word form the lowest set bit of imm5 selects the element
     size; the bits above it form the element index.  */
  else if (imm5 & 0x1)
    aarch64_set_reg_u32 (cpu, dst, NO_SP,
                         aarch64_get_vec_u8 (cpu, src, (imm5 >> 1) & 0xF));
  else if (imm5 & 0x2)
    aarch64_set_reg_u32 (cpu, dst, NO_SP,
                         aarch64_get_vec_u16 (cpu, src, (imm5 >> 2) & 0x7));
  else if (imm5 & 0x4)
    aarch64_set_reg_u32 (cpu, dst, NO_SP,
                         aarch64_get_vec_u32 (cpu, src, (imm5 >> 3) & 0x3));
  else
    HALT_UNALLOC;
}

/* INS (general): insert the low bits of a general register into one
   element of a vector register, leaving the other elements alone.  */
static void
do_vec_INS (sim_cpu *cpu)
{
  /* instr[31,21] = 01001110000
     instr[20,16] = element size and index
     instr[15,10] = 000111
     instr[9,5]   = W source
     instr[4,0]   = V dest  */

  unsigned imm5 = INSTR (20, 16);
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 21, 0x270);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The lowest set bit of imm5 selects the element size; the bits
     above it form the element index.  */
  if (imm5 & 0x1)
    aarch64_set_vec_u8 (cpu, dst, imm5 >> 1,
                        aarch64_get_reg_u8 (cpu, src, NO_SP));
  else if (imm5 & 0x2)
    aarch64_set_vec_u16 (cpu, dst, imm5 >> 2,
                         aarch64_get_reg_u16 (cpu, src, NO_SP));
  else if (imm5 & 0x4)
    aarch64_set_vec_u32 (cpu, dst, imm5 >> 3,
                         aarch64_get_reg_u32 (cpu, src, NO_SP));
  else if (imm5 & 0x8)
    aarch64_set_vec_u64 (cpu, dst, imm5 >> 4,
                         aarch64_get_reg_u64 (cpu, src, NO_SP));
  else
    HALT_NYI;
}

/* DUP (element): replicate one element of the source vector into
   every lane of the destination (the low 64 bits only for the half
   form).  */
static void
do_vec_DUP_vector_into_vector (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,21] = 00 1110 000
     instr[20,16] = element size and index
     instr[15,10] = 0000 01
     instr[9,5]   = V source
     instr[4,0]   = V dest.  */

  unsigned full = INSTR (30, 30);
  unsigned imm5 = INSTR (20, 16);
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  int lane;

  NYI_assert (29, 21, 0x070);
  NYI_assert (15, 10, 0x01);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The lowest set bit of imm5 selects the element size; the bits
     above it form the source element index.  The element is fetched
     once up front, which is safe even when source and destination are
     the same register.  */
  if (imm5 & 0x1)
    {
      uint8_t elem = aarch64_get_vec_u8 (cpu, src, imm5 >> 1);

      for (lane = 0; lane < (full ? 16 : 8); lane++)
        aarch64_set_vec_u8 (cpu, dst, lane, elem);
    }
  else if (imm5 & 0x2)
    {
      uint16_t elem = aarch64_get_vec_u16 (cpu, src, imm5 >> 2);

      for (lane = 0; lane < (full ? 8 : 4); lane++)
        aarch64_set_vec_u16 (cpu, dst, lane, elem);
    }
  else if (imm5 & 0x4)
    {
      uint32_t elem = aarch64_get_vec_u32 (cpu, src, imm5 >> 3);

      for (lane = 0; lane < (full ? 4 : 2); lane++)
        aarch64_set_vec_u32 (cpu, dst, lane, elem);
    }
  else if ((imm5 & 0x8) && full)
    {
      /* 64 bit elements exist only in the full form.  */
      uint64_t elem = aarch64_get_vec_u64 (cpu, src, imm5 >> 4);

      for (lane = 0; lane < 2; lane++)
        aarch64_set_vec_u64 (cpu, dst, lane, elem);
    }
  else
    HALT_UNALLOC;
}

/* TBL: table lookup.  Each byte of Vm is an index into a table made of
   LEN consecutive vector registers starting at Vn; the selected table
   byte is written to the matching byte of Vd, and an index beyond the
   end of the table gives zero.

   The table register numbers wrap around from 31 to 0, and the whole
   result is built before Vd is written so that Vd may also be one of
   the table registers.  */
static void
do_vec_TBL (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,21] = 00 1110 000
     instr[20,16] = Vm
     instr[15]    = 0
     instr[14,13] = vec length
     instr[12,10] = 000
     instr[9,5]   = V start
     instr[4,0]   = V dest  */

  int full    = INSTR (30, 30);
  unsigned len = INSTR (14, 13) + 1;
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned lanes = full ? 16 : 8;
  uint8_t result[16];
  unsigned i;

  NYI_assert (29, 21, 0x070);
  NYI_assert (12, 10, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (i = 0; i < lanes; i++)
    {
      unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
      /* Each table register supplies 16 bytes.  */
      unsigned int table = selector / 16;

      if (table < len)
        result[i] = aarch64_get_vec_u8 (cpu, (vn + table) % 32,
                                        selector % 16);
      else
        result[i] = 0;
    }

  for (i = 0; i < lanes; i++)
    aarch64_set_vec_u8 (cpu, vd, i, result[i]);
}

/* TRN1/TRN2: transpose.  For every pair of adjacent lanes, the even
   destination lane receives an element of Vn and the odd destination
   lane the corresponding element of Vm.  TRN1 takes the even-numbered
   source element of each pair and TRN2 the odd-numbered one.

   Both source elements of a pair are read before the pair is written,
   so Vd may be the same register as Vn or Vm.  */
static void
do_vec_TRN (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,24] = 00 1110
     instr[23,22] = size
     instr[21]    = 0
     instr[20,16] = Vm
     instr[15]    = 0
     instr[14]    = TRN1 (0) / TRN2 (1)
     instr[13,10] = 1010
     instr[9,5]   = V source
     instr[4,0]   = V dest.  */

  int full    = INSTR (30, 30);
  unsigned part = INSTR (14, 14);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (13, 10, 0xA);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      for (i = 0; i < (full ? 8 : 4); i++)
        {
          uint8_t from_n = aarch64_get_vec_u8 (cpu, vn, i * 2 + part);
          uint8_t from_m = aarch64_get_vec_u8 (cpu, vm, i * 2 + part);

          aarch64_set_vec_u8 (cpu, vd, i * 2, from_n);
          aarch64_set_vec_u8 (cpu, vd, i * 2 + 1, from_m);
        }
      break;

    case 1:
      for (i = 0; i < (full ? 4 : 2); i++)
        {
          uint16_t from_n = aarch64_get_vec_u16 (cpu, vn, i * 2 + part);
          uint16_t from_m = aarch64_get_vec_u16 (cpu, vm, i * 2 + part);

          aarch64_set_vec_u16 (cpu, vd, i * 2, from_n);
          aarch64_set_vec_u16 (cpu, vd, i * 2 + 1, from_m);
        }
      break;

    case 2:
      for (i = 0; i < (full ? 2 : 1); i++)
        {
          uint32_t from_n = aarch64_get_vec_u32 (cpu, vn, i * 2 + part);
          uint32_t from_m = aarch64_get_vec_u32 (cpu, vm, i * 2 + part);

          aarch64_set_vec_u32 (cpu, vd, i * 2, from_n);
          aarch64_set_vec_u32 (cpu, vd, i * 2 + 1, from_m);
        }
      break;

    case 3:
      /* 64 bit elements need the full 128 bit form.  */
      if (! full)
        HALT_UNALLOC;

      {
        uint64_t from_n = aarch64_get_vec_u64 (cpu, vn, part);
        uint64_t from_m = aarch64_get_vec_u64 (cpu, vm, part);

        aarch64_set_vec_u64 (cpu, vd, 0, from_n);
        aarch64_set_vec_u64 (cpu, vd, 1, from_m);
      }
      break;
    }
}

/* DUP (general): replicate the low bits of a general register into
   every lane of the destination vector (the low 64 bits only when
   instr[30] is clear).  */
static void
do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = 0=> only the low 64-bits are written,
                    1=> duplicate into top 64-bits too
                    [must be 1 for 64-bit xfer]
     instr[29,20] = 00 1110 0000
     instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
                                  0100=> 32-bits. 1000=>64-bits
     instr[15,10] = 0000 11
     instr[9,5]   = W source
     instr[4,0]   = V dest.  */

  unsigned lane;
  unsigned dst = INSTR (4, 0);
  unsigned src = INSTR (9, 5);
  /* Number of 64 bit halves of the destination to fill.  */
  unsigned halves = INSTR (30, 30) ? 2 : 1;

  NYI_assert (29, 20, 0x0E0);
  NYI_assert (15, 10, 0x03);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (19, 16))
    {
    case 1:
      {
        uint8_t elem = aarch64_get_reg_u8 (cpu, src, NO_SP);

        for (lane = 0; lane < halves * 8; lane++)
          aarch64_set_vec_u8 (cpu, dst, lane, elem);
      }
      break;

    case 2:
      {
        uint16_t elem = aarch64_get_reg_u16 (cpu, src, NO_SP);

        for (lane = 0; lane < halves * 4; lane++)
          aarch64_set_vec_u16 (cpu, dst, lane, elem);
      }
      break;

    case 4:
      {
        uint32_t elem = aarch64_get_reg_u32 (cpu, src, NO_SP);

        for (lane = 0; lane < halves * 2; lane++)
          aarch64_set_vec_u32 (cpu, dst, lane, elem);
      }
      break;

    case 8:
      if (halves != 2)
        HALT_NYI;

      {
        uint64_t elem = aarch64_get_reg_u64 (cpu, src, NO_SP);

        for (lane = 0; lane < 2; lane++)
          aarch64_set_vec_u64 (cpu, dst, lane, elem);
      }
      break;

    default:
      HALT_NYI;
    }
}

/* UZP: unzip.  Treats Vn (low) and Vm (high) as one long sequence of
   elements and keeps every second element of it: the even-numbered
   ones when instr[14] is clear, the odd-numbered ones when it is set.
   The work is done on 64 bit chunks with shifts and masks rather than
   element by element.  */
static void
do_vec_UZP (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,24] = 00 1110
     instr[23,22] = size: byte(00), half(01), word (10), long (11)
     instr[21]    = 0
     instr[20,16] = Vm
     instr[15]    = 0
     instr[14]    = lower (0) / upper (1)
     instr[13,10] = 0110
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  int full = INSTR (30, 30);
  int upper = INSTR (14, 14);

  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);

  /* All source data is read up front, so Vd may alias Vn or Vm.  */
  uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
  uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
  uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
  uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);

  /* val1 becomes the low 64 bits of the result; val2 becomes the high
     64 bits and is only computed and stored for the full form.  */
  uint64_t val1;
  uint64_t val2;

  /* The chunk that follows the low half of Vn in the source sequence:
     the high half of Vn for the full form, the low half of Vm for the
     half form.  */
  uint64_t input2 = full ? val_n2 : val_m1;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 0);
  NYI_assert (15, 15, 0);
  NYI_assert (13, 10, 6);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      /* Bytes: the low half of val1 takes bytes upper, 2 + upper,
	 4 + upper and 6 + upper of val_n1; the high half takes the same
	 bytes of input2.  */
      val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
      val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
      val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
      val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;

      val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
      val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
      val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
      val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;

      if (full)
	{
	  /* Same selection again, this time from the two halves of Vm.  */
	  val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
	  val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
	  val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
	  val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;

	  val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
	  val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
	  val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
	  val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
	}
      break;

    case 1:
      /* Half words: elements upper and 2 + upper of each chunk.  */
      val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
      val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;

      val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;;
      val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;

      if (full)
	{
	  val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
	  val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;

	  val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
	  val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
	}
      break;

    case 2:
      /* Words: one word (low or high, per upper) from each chunk.  */
      val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
      val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;

      if (full)
	{
	  val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
	  val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
	}
      break;

    case 3:
      /* 64 bit elements need the full 128 bit form.  */
      if (! full)
	HALT_UNALLOC;

      val1 = upper ? val_n2 : val_n1;
      val2 = upper ? val_m2 : val_m1;
      break;
    }

  aarch64_set_vec_u64 (cpu, vd, 0, val1);
  if (full)
    aarch64_set_vec_u64 (cpu, vd, 1, val2);
}

/* ZIP1/ZIP2: zip.  Interleaves elements of Vn and Vm: result lane 2p
   comes from Vn and result lane 2p + 1 from Vm, both taken from source
   element base + p.  ZIP1 (instr[14] clear) draws from the lower half
   of the source vectors (base 0); ZIP2 draws from the upper half.

   Both sources are read in full before Vd is written, so Vd may alias
   either of them.  */
static void
do_vec_ZIP (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,24] = 00 1110
     instr[23,22] = size: byte(00), half(01), word (10), long (11)
     instr[21]    = 0
     instr[20,16] = Vm
     instr[15]    = 0
     instr[14]    = lower (0) / upper (1)
     instr[13,10] = 1110
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  int full = INSTR (30, 30);
  int upper = INSTR (14, 14);
  unsigned size = INSTR (23, 22);

  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);

  /* Element width in bits, number of (Vn, Vm) pairs in the result and
     index of the first source element used.  */
  unsigned esize = 8u << size;
  unsigned pairs = (full ? 128u : 64u) / esize / 2;
  unsigned base = upper ? pairs : 0;
  uint64_t emask = (esize == 64
                    ? ~ (uint64_t) 0
                    : ((uint64_t) 1 << esize) - 1);

  uint64_t val_n[2];
  uint64_t val_m[2];
  uint64_t result[2] = { 0, 0 };
  unsigned p;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 0);
  NYI_assert (15, 15, 0);
  NYI_assert (13, 10, 0xE);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* 64 bit elements need the full 128 bit form.  */
  if (size == 3 && ! full)
    HALT_UNALLOC;

  val_n[0] = aarch64_get_vec_u64 (cpu, vn, 0);
  val_n[1] = aarch64_get_vec_u64 (cpu, vn, 1);
  val_m[0] = aarch64_get_vec_u64 (cpu, vm, 0);
  val_m[1] = aarch64_get_vec_u64 (cpu, vm, 1);

  for (p = 0; p < pairs; p++)
    {
      /* Bit offsets of the source element and of the even result lane
         within the 128 bit vectors.  */
      unsigned src = (base + p) * esize;
      unsigned dst = 2 * p * esize;
      uint64_t from_n = (val_n[src / 64] >> (src % 64)) & emask;
      uint64_t from_m = (val_m[src / 64] >> (src % 64)) & emask;

      result[dst / 64] |= from_n << (dst % 64);
      dst += esize;
      result[dst / 64] |= from_m << (dst % 64);
    }

  aarch64_set_vec_u64 (cpu, vd, 0, result[0]);
  if (full)
    aarch64_set_vec_u64 (cpu, vd, 1, result[1]);
}

/* Floating point immediates are encoded in 8 bits.
   fpimm[7] = sign bit.
   fpimm[6:4] = signed exponent.
   fpimm[3:0] = fraction (assuming leading 1).
   i.e. F = s * 1.f * 2^(e - b).  */

/* Expand an 8 bit floating point immediate to a float.

   imm8[7] is the sign, imm8[6:4] the exponent field and imm8[3:0] the
   fraction.  The magnitude is (16 + fraction) / 16 scaled by a power
   of two: exponent fields 0..3 scale up by 2^1..2^4, and fields 4..7
   scale down by 2^3..2^0.  */
static float
fp_immediate_for_encoding_32 (uint32_t imm8)
{
  uint32_t sign = (imm8 >> 7) & 0x1;
  uint32_t exp_field = (imm8 >> 4) & 0x7;
  uint32_t fraction = imm8 & 0xf;
  uint32_t steps;
  float value = (16.0 + fraction) / 16.0;

  if (exp_field < 4)
    {
      /* Positive exponent: double exp_field + 1 times.  */
      for (steps = exp_field + 1; steps > 0; steps--)
        value *= 2.0;
    }
  else
    {
      /* Non-positive exponent: halve 7 - exp_field times.  */
      for (steps = 7 - exp_field; steps > 0; steps--)
        value /= 2.0;
    }

  return sign ? - value : value;
}

/* Expand an 8 bit floating point immediate to a double.

   imm8[7] is the sign, imm8[6:4] the exponent field and imm8[3:0] the
   fraction.  The magnitude is (16 + fraction) / 16 scaled by a power
   of two: exponent fields 0..3 scale up by 2^1..2^4, and fields 4..7
   scale down by 2^3..2^0.  */
static double
fp_immediate_for_encoding_64 (uint32_t imm8)
{
  uint32_t sign = (imm8 >> 7) & 0x1;
  uint32_t exp_field = (imm8 >> 4) & 0x7;
  uint32_t fraction = imm8 & 0xf;
  uint32_t steps;
  double value = (16.0 + fraction) / 16.0;

  if (exp_field < 4)
    {
      /* Positive exponent: double exp_field + 1 times.  */
      for (steps = exp_field + 1; steps > 0; steps--)
        value *= 2.0;
    }
  else
    {
      /* Non-positive exponent: halve 7 - exp_field times.  */
      for (steps = 7 - exp_field; steps > 0; steps--)
        value /= 2.0;
    }

  return sign ? - value : value;
}

/* Vector move immediate: expand the 8 bit immediate according to
   instr[15,12] and store it in every lane of Vd (the low 64 bits only
   for the half form).  Selector values not listed below halt as not
   yet implemented.  */
static void
do_vec_MOV_immediate (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half selector
     instr[29,19] = 00111100000
     instr[18,16] = high 3 bits of uimm8
     instr[15,12] = size & shift:
                                  0000 => 32-bit
                                  0010 => 32-bit + LSL#8
                                  0100 => 32-bit + LSL#16
                                  0110 => 32-bit + LSL#24
                                  1010 => 16-bit + LSL#8
                                  1000 => 16-bit
                                  1101 => 32-bit + MSL#16
                                  1100 => 32-bit + MSL#8
                                  1110 => 8-bit
                                  1111 => single precision fp immediate
     instr[11,10] = 01
     instr[9,5]   = low 5-bits of uimm8
     instr[4,0]   = Vd.  */

  /* Number of 64 bit halves of Vd to fill.  */
  unsigned halves = INSTR (30, 30) ? 2 : 1;
  unsigned dst = INSTR (4, 0);
  unsigned imm = (INSTR (18, 16) << 5) | INSTR (9, 5);
  unsigned selector = INSTR (15, 12);
  unsigned lane;

  NYI_assert (29, 19, 0x1E0);
  NYI_assert (11, 10, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (selector)
    {
    case 0x0: /* 32-bit, no shift.  */
    case 0x2: /* 32-bit, shift by 8.  */
    case 0x4: /* 32-bit, shift by 16.  */
    case 0x6: /* 32-bit, shift by 24.  */
      /* For these selectors instr[14,13] is simply selector / 2.  */
      imm <<= 8 * (selector >> 1);
      for (lane = 0; lane < halves * 2; lane++)
        aarch64_set_vec_u32 (cpu, dst, lane, imm);
      break;

    case 0x8: /* 16-bit, no shift.  */
    case 0xa: /* 16-bit, shift by 8.  */
      if (selector == 0xa)
        imm <<= 8;
      for (lane = 0; lane < halves * 4; lane++)
        aarch64_set_vec_u16 (cpu, dst, lane, imm);
      break;

    case 0xc: /* 32-bit, mask shift by 8: ones are shifted in.  */
      imm = (imm << 8) | 0xFF;
      for (lane = 0; lane < halves * 2; lane++)
        aarch64_set_vec_u32 (cpu, dst, lane, imm);
      break;

    case 0xd: /* 32-bit, mask shift by 16: ones are shifted in.  */
      imm = (imm << 16) | 0xFFFF;
      for (lane = 0; lane < halves * 2; lane++)
        aarch64_set_vec_u32 (cpu, dst, lane, imm);
      break;

    case 0xe: /* 8-bit, no shift.  */
      for (lane = 0; lane < halves * 8; lane++)
        aarch64_set_vec_u8 (cpu, dst, lane, imm);
      break;

    case 0xf: /* FMOV Vs.{2|4}S, #fpimm.  */
      {
        float fpimm = fp_immediate_for_encoding_32 (imm);

        for (lane = 0; lane < halves * 2; lane++)
          aarch64_set_vec_float (cpu, dst, lane, fpimm);
      }
      break;

    default:
      HALT_NYI;
    }
}

/* Vector move inverted immediate, plus the other encodings that share
   this opcode space: the 64 bit byte-mask MOVI and the double
   precision FMOV immediate.  Selector values not listed below halt as
   not yet implemented.  */
static void
do_vec_MVNI (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half selector
     instr[29,19] = 10111100000
     instr[18,16] = high 3 bits of uimm8
     instr[15,12] = selector
     instr[11,10] = 01
     instr[9,5]   = low 5-bits of uimm8
     instr[4,0]   = Vd.  */

  int full     = INSTR (30, 30);
  unsigned vd  = INSTR (4, 0);
  unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
  unsigned i;

  NYI_assert (29, 19, 0x5E0);
  NYI_assert (11, 10, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (15, 12))
    {
    case 0x0: /* 32-bit, no shift.  */
    case 0x2: /* 32-bit, shift by 8.  */
    case 0x4: /* 32-bit, shift by 16.  */
    case 0x6: /* 32-bit, shift by 24.  */
      val <<= (8 * INSTR (14, 13));
      val = ~ val;
      for (i = 0; i < (full ? 4 : 2); i++)
        aarch64_set_vec_u32 (cpu, vd, i, val);
      return;

    case 0xa: /* 16-bit, 8 bit shift. */
      val <<= 8;
      /* Fall through.  */
    case 0x8: /* 16-bit, no shift. */
      val = ~ val;
      for (i = 0; i < (full ? 8 : 4); i++)
        aarch64_set_vec_u16 (cpu, vd, i, val);
      return;

    case 0xd: /* 32-bit, mask shift by 16.  */
      val <<= 8;
      val |= 0xFF;
      /* Fall through.  */
    case 0xc: /* 32-bit, mask shift by 8. */
      val <<= 8;
      val |= 0xFF;
      val = ~ val;
      for (i = 0; i < (full ? 4 : 2); i++)
        aarch64_set_vec_u32 (cpu, vd, i, val);
      return;

    case 0xE: /* MOVI Dn, #mask64 and MOVI Vd.2D, #mask64.  */
      {
        uint64_t mask = 0;

        /* Each bit of the immediate expands to a whole byte of the
           mask.  The constant must be 64 bits wide because the shift
           count reaches 56.  */
        for (i = 0; i < 8; i++)
          if (val & (1 << i))
            mask |= (0xFFULL << (i * 8));
        aarch64_set_vec_u64 (cpu, vd, 0, mask);
        /* The scalar (half) form clears the upper 64 bits; the vector
           form replicates the mask into them.  */
        aarch64_set_vec_u64 (cpu, vd, 1, full ? mask : 0);
        return;
      }

    case 0xf: /* FMOV Vd.2D, #fpimm.  */
      {
        double u = fp_immediate_for_encoding_64 (val);

        if (! full)
          HALT_UNALLOC;

        aarch64_set_vec_double (cpu, vd, 0, u);
        aarch64_set_vec_double (cpu, vd, 1, u);
        return;
      }

    default:
      HALT_NYI;
    }
}

/* Absolute value of A.  The argument appears more than once in the
   expansion, so it is evaluated more than once.  */
#define ABS(A) (((A) < 0) ? (- (A)) : (A))

/* Emulate the vector ABS instruction: store the absolute value of each
   signed lane of Vn in the matching lane of Vd.  The 64-bit lane size is
   only handled for the full-width form; the half-width form halts as
   unimplemented.  */
static void
do_vec_ABS (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,24] = 00 1110
     instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
     instr[21,10] = 10 0000 1011 10
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned full = INSTR (30, 30);
  unsigned i;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 10, 0x82E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      for (i = 0; i < (full ? 16 : 8); i++)
	aarch64_set_vec_s8 (cpu, vd, i,
			    ABS (aarch64_get_vec_s8 (cpu, vn, i)));
      break;

    case 1:
      for (i = 0; i < (full ? 8 : 4); i++)
	aarch64_set_vec_s16 (cpu, vd, i,
			     ABS (aarch64_get_vec_s16 (cpu, vn, i)));
      break;

    case 2:
      for (i = 0; i < (full ? 4 : 2); i++)
	{
	  int32_t val = aarch64_get_vec_s32 (cpu, vn, i);
	  /* Negate in unsigned arithmetic.  Negating INT32_MIN as a
	     signed value overflows, whereas the unsigned form wraps
	     round to the same bit pattern.  */
	  uint32_t mag = val < 0 ? 0U - (uint32_t) val : (uint32_t) val;

	  aarch64_set_vec_s32 (cpu, vd, i, (int32_t) mag);
	}
      break;

    case 3:
      if (! full)
	HALT_NYI;
      for (i = 0; i < 2; i++)
	{
	  int64_t val = aarch64_get_vec_s64 (cpu, vn, i);
	  /* As above: avoid the signed overflow on INT64_MIN.  */
	  uint64_t mag = val < 0
	    ? (uint64_t) 0 - (uint64_t) val : (uint64_t) val;

	  aarch64_set_vec_s64 (cpu, vd, i, (int64_t) mag);
	}
      break;
    }
}

/* Emulate ADDV: add together the lanes of Vm, wrapping at the lane
   width, and store the sum in 64-bit element 0 of vector register Rd.
   The 32-bit form requires the full-width vector and the 64-bit size
   halts as unallocated.  */
static void
do_vec_ADDV (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = full/half selector
     instr[29,24] = 00 1110
     instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
     instr[21,10] = 11 0001 1011 10
     instr[9,5]   = Vm
     instr[4.0]   = Rd.  */

  const unsigned src = INSTR (9, 5);
  const unsigned dst = INSTR (4, 0);
  const int      is_full = INSTR (30, 30);
  unsigned lane;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 10, 0xC6E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      {
	const unsigned count = is_full ? 16 : 8;
	uint8_t total = 0;

	for (lane = 0; lane < count; lane++)
	  total = total + aarch64_get_vec_u8 (cpu, src, lane);
	aarch64_set_vec_u64 (cpu, dst, 0, total);
	return;
      }

    case 1:
      {
	const unsigned count = is_full ? 8 : 4;
	uint16_t total = 0;

	for (lane = 0; lane < count; lane++)
	  total = total + aarch64_get_vec_u16 (cpu, src, lane);
	aarch64_set_vec_u64 (cpu, dst, 0, total);
	return;
      }

    case 2:
      {
	uint32_t total = 0;

	/* Only the four-lane form is accepted for 32-bit elements.  */
	if (! is_full)
	  HALT_UNALLOC;
	for (lane = 0; lane < 4; lane++)
	  total = total + aarch64_get_vec_u32 (cpu, src, lane);
	aarch64_set_vec_u64 (cpu, dst, 0, total);
	return;
      }

    case 3:
      HALT_UNALLOC;
    }
}

/* Emulate a 32-bit or 64-bit element move between a vector register and
   a general register.  instr[13] selects the direction and instr[20,18]
   encode both the element size and its index.  */
static void
do_vec_ins_2 (sim_cpu *cpu)
{
  /* Encoding:
     instr[31,21] = 01001110000
     instr[20,18] = size & element selector
     instr[17,14] = 0000
     instr[13]    = direction: to vec(0), from vec (1)
     instr[12,10] = 111
     instr[9,5]   = Vm
     instr[4,0]   = Vd.  */

  const unsigned vm = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const int from_vec = (INSTR (13, 13) == 1);
  unsigned elem;

  NYI_assert (31, 21, 0x270);
  NYI_assert (17, 14, 0);
  NYI_assert (12, 10, 7);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (INSTR (18, 18) == 1)
    {
      /* 32-bit moves: the element index is in instr[20,19].  */
      elem = INSTR (20, 19);
      if (from_vec)
	aarch64_set_reg_u64 (cpu, vd, NO_SP,
			     aarch64_get_vec_u32 (cpu, vm, elem));
      else
	aarch64_set_vec_u32 (cpu, vd, elem,
			     aarch64_get_reg_u32 (cpu, vm, NO_SP));
      return;
    }

  /* 64-bit moves: instr[19] must be set and the index is instr[20].  */
  if (INSTR (19, 19) != 1)
    HALT_NYI;

  elem = INSTR (20, 20);
  if (from_vec)
    aarch64_set_reg_u64 (cpu, vd, NO_SP,
			 aarch64_get_vec_u64 (cpu, vm, elem));
  else
    aarch64_set_vec_u64 (cpu, vd, elem,
			 aarch64_get_reg_u64 (cpu, vm, NO_SP));
}

/* Multiply N lanes of Vn by the matching lanes of Vm and store the
   products in lanes 0..N-1 of Vd.  Source lanes are read with
   aarch64_get_vec_<READ_TYPE> starting at lane BIAS and are held as
   DST_TYPE; results are written with aarch64_set_vec_<WRITE_TYPE>.
   All sources are read before any result is written, in case the
   source and destination vectors are the same.

   The macro expects cpu, vn, vm, vd, i and bias to be in scope.

   The product is formed in uint64_t and then converted to DST_TYPE.
   Multiplying two DST_TYPE values directly would promote narrow
   unsigned types to int, and 0xFFFF * 0xFFFF does not fit in a 32-bit
   int.  The low bits of the 64-bit product are the same as those of
   the exact product, for signed and unsigned inputs alike.  */
#define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)	  \
  do								  \
    {								  \
      DST_TYPE a[N], b[N];					  \
								  \
      for (i = 0; i < (N); i++)					  \
	{							  \
	  a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
	  b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
	}							  \
      for (i = 0; i < (N); i++)					  \
	aarch64_set_vec_##WRITE_TYPE				  \
	  (cpu, vd, i,						  \
	   (DST_TYPE) ((uint64_t) a[i] * (uint64_t) b[i]));	  \
    }								  \
  while (0)

/* Emulate the widening vector multiply: multiply the lower or upper
   half of Vn by the same half of Vm, as signed or unsigned values, and
   store double-width products in Vd.  The 64-bit source size halts as
   unimplemented.  */
static void
do_vec_mull (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = lower(0)/upper(1) selector
     instr[29]    = signed(0)/unsigned(1)
     instr[28,24] = 0 1110
     instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 11 0000
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const int is_unsigned = INSTR (29, 29);
  const int upper = INSTR (30, 30);
  /* First source lane to read; DO_VEC_WIDENING_MUL uses it by name,
     together with i, vn, vm and vd.  */
  int      bias;
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR ( 9,  5);
  unsigned vd = INSTR ( 4,  0);
  unsigned i;

  NYI_assert (28, 24, 0x0E);
  NYI_assert (15, 10, 0x30);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* NB: The macro reads every source value before writing any result,
     in case the source and destination vectors are the same.  */
  switch (INSTR (23, 22))
    {
    case 0:
      bias = upper ? 8 : 0;
      if (! is_unsigned)
	DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
      else
	DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
      return;

    case 1:
      bias = upper ? 4 : 0;
      if (! is_unsigned)
	DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
      else
	DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
      return;

    case 2:
      bias = upper ? 2 : 0;
      if (! is_unsigned)
	DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
      else
	DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
      return;

    case 3:
      HALT_NYI;
    }
}

/* Emulate vector FADD and FSUB on single or double precision lanes.
   FSUB computes Vn - Vm.  The double precision form is only handled for
   the full-width vector; the half-width form halts as unimplemented.  */
static void
do_vec_fadd (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,24] = 001110
     instr[23]    = FADD(0)/FSUB(1)
     instr[22]    = float (0)/double(1)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 110101
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const int      full = INSTR (30, 30);
  unsigned lane;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x35);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (INSTR (22, 22))
    {
      /* Double precision lanes.  */
      const int subtract = INSTR (23, 23);

      if (! full)
	HALT_NYI;

      for (lane = 0; lane < 2; lane++)
	{
	  if (subtract)
	    aarch64_set_vec_double (cpu, vd, lane,
				    aarch64_get_vec_double (cpu, vn, lane)
				    - aarch64_get_vec_double (cpu, vm, lane));
	  else
	    aarch64_set_vec_double (cpu, vd, lane,
				    aarch64_get_vec_double (cpu, vm, lane)
				    + aarch64_get_vec_double (cpu, vn, lane));
	}
    }
  else
    {
      /* Single precision lanes.  */
      const int subtract = INSTR (23, 23);
      const unsigned count = full ? 4 : 2;

      for (lane = 0; lane < count; lane++)
	{
	  if (subtract)
	    aarch64_set_vec_float (cpu, vd, lane,
				   aarch64_get_vec_float (cpu, vn, lane)
				   - aarch64_get_vec_float (cpu, vm, lane));
	  else
	    aarch64_set_vec_float (cpu, vd, lane,
				   aarch64_get_vec_float (cpu, vm, lane)
				   + aarch64_get_vec_float (cpu, vn, lane));
	}
    }
}

/* Emulate the vector integer ADD: Vd = Vn + Vm, lane by lane, with each
   sum stored back at the lane width.  The 64-bit lane size requires the
   full-width vector.  */
static void
do_vec_add (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = full/half selector
     instr[29,24] = 001110
     instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 100001
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const int      full = INSTR (30, 30);
  unsigned lane;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x21);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      for (lane = 0; lane < (full ? 16 : 8); lane++)
	{
	  uint8_t lhs = aarch64_get_vec_u8 (cpu, vn, lane);
	  uint8_t rhs = aarch64_get_vec_u8 (cpu, vm, lane);

	  aarch64_set_vec_u8 (cpu, vd, lane, lhs + rhs);
	}
      return;

    case 1:
      for (lane = 0; lane < (full ? 8 : 4); lane++)
	{
	  uint16_t lhs = aarch64_get_vec_u16 (cpu, vn, lane);
	  uint16_t rhs = aarch64_get_vec_u16 (cpu, vm, lane);

	  aarch64_set_vec_u16 (cpu, vd, lane, lhs + rhs);
	}
      return;

    case 2:
      for (lane = 0; lane < (full ? 4 : 2); lane++)
	{
	  uint32_t lhs = aarch64_get_vec_u32 (cpu, vn, lane);
	  uint32_t rhs = aarch64_get_vec_u32 (cpu, vm, lane);

	  aarch64_set_vec_u32 (cpu, vd, lane, lhs + rhs);
	}
      return;

    case 3:
      if (! full)
	HALT_UNALLOC;
      for (lane = 0; lane < 2; lane++)
	{
	  uint64_t lhs = aarch64_get_vec_u64 (cpu, vn, lane);
	  uint64_t rhs = aarch64_get_vec_u64 (cpu, vm, lane);

	  aarch64_set_vec_u64 (cpu, vd, lane, lhs + rhs);
	}
      return;
    }
}

/* Emulate the vector integer MUL: Vd = Vn * Vm, lane by lane, with the
   product kept at the lane width.  The 64-bit lane size halts as
   unallocated.  */
static void
do_vec_mul (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = full/half selector
     instr[29,24] = 00 1110
     instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 10 0111
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  /* DO_VEC_WIDENING_MUL refers to vm, vn, vd, i and bias by name.  The
     multiply is not widening here, so source lanes start at 0.  */
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;
  int      full = INSTR (30, 30);
  int      bias = 0;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x27);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      if (full)
	DO_VEC_WIDENING_MUL (16, uint8_t, u8, u8);
      else
	DO_VEC_WIDENING_MUL (8, uint8_t, u8, u8);
      return;

    case 1:
      if (full)
	DO_VEC_WIDENING_MUL (8, uint16_t, u16, u16);
      else
	DO_VEC_WIDENING_MUL (4, uint16_t, u16, u16);
      return;

    case 2:
      if (full)
	DO_VEC_WIDENING_MUL (4, uint32_t, u32, u32);
      else
	DO_VEC_WIDENING_MUL (2, uint32_t, u32, u32);
      return;

    case 3:
      HALT_UNALLOC;
    }
}

/* Emulate the vector multiply-accumulate: Vd = Vd + (Vn * Vm), lane by
   lane, with the result kept at the lane width.  The 64-bit lane size
   halts as unallocated.  */
static void
do_vec_MLA (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half selector
     instr[29,24] = 00 1110
     instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 1001 01
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;
  int      full = INSTR (30, 30);

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x25);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The arithmetic is done in uint32_t for every lane size.  Operating
     on the narrow values directly would promote them to int, and the
     product of two 16-bit lanes (up to 0xFFFF * 0xFFFF) does not fit in
     a 32-bit int.  Unsigned arithmetic wraps, and the low bits kept by
     the store are unaffected.  */
  switch (INSTR (23, 22))
    {
    case 0:
      for (i = 0; i < (full ? 16 : 8); i++)
	{
	  uint32_t acc = aarch64_get_vec_u8 (cpu, vd, i);
	  uint32_t lhs = aarch64_get_vec_u8 (cpu, vn, i);
	  uint32_t rhs = aarch64_get_vec_u8 (cpu, vm, i);

	  aarch64_set_vec_u8 (cpu, vd, i, (uint8_t) (acc + lhs * rhs));
	}
      return;

    case 1:
      for (i = 0; i < (full ? 8 : 4); i++)
	{
	  uint32_t acc = aarch64_get_vec_u16 (cpu, vd, i);
	  uint32_t lhs = aarch64_get_vec_u16 (cpu, vn, i);
	  uint32_t rhs = aarch64_get_vec_u16 (cpu, vm, i);

	  aarch64_set_vec_u16 (cpu, vd, i, (uint16_t) (acc + lhs * rhs));
	}
      return;

    case 2:
      for (i = 0; i < (full ? 4 : 2); i++)
	{
	  uint32_t acc = aarch64_get_vec_u32 (cpu, vd, i);
	  uint32_t lhs = aarch64_get_vec_u32 (cpu, vn, i);
	  uint32_t rhs = aarch64_get_vec_u32 (cpu, vm, i);

	  aarch64_set_vec_u32 (cpu, vd, i, acc + lhs * rhs);
	}
      return;

    default:
      HALT_UNALLOC;
    }
}

/* Return the larger of A and B, preferring a number over a NaN.  If
   exactly one operand is a NaN the other operand is returned; if both
   are NaNs, A is returned.  When the operands are zeros of opposite
   sign the result is +0 whichever order they are given in.  */
static float
fmaxnm (float a, float b)
{
  if (isnan (a))
    return isnan (b) ? a : b;
  if (isnan (b))
    return a;
  /* +0 and -0 compare equal, so pick by sign: if A is the negative one
     then B is either identical to A or is the +0 we want.  */
  if (a == b)
    return signbit (a) ? b : a;
  return a > b ? a : b;
}

/* Return the smaller of A and B, preferring a number over a NaN.  If
   exactly one operand is a NaN the other operand is returned; if both
   are NaNs, A is returned.  When the operands are zeros of opposite
   sign the result is -0 whichever order they are given in.  */
static float
fminnm (float a, float b)
{
  if (isnan (a))
    return isnan (b) ? a : b;
  if (isnan (b))
    return a;
  /* +0 and -0 compare equal, so pick by sign: if A is not the negative
     one then B is either identical to A or is the -0 we want.  */
  if (a == b)
    return signbit (a) ? a : b;
  return a < b ? a : b;
}

/* Double precision counterpart of fmaxnm: return the larger of A and B,
   preferring a number over a NaN.  If exactly one operand is a NaN the
   other operand is returned; if both are NaNs, A is returned.  When the
   operands are zeros of opposite sign the result is +0 whichever order
   they are given in.  */
static double
dmaxnm (double a, double b)
{
  if (isnan (a))
    return isnan (b) ? a : b;
  if (isnan (b))
    return a;
  /* +0 and -0 compare equal, so pick by sign: if A is the negative one
     then B is either identical to A or is the +0 we want.  */
  if (a == b)
    return signbit (a) ? b : a;
  return a > b ? a : b;
}

/* Double precision counterpart of fminnm: return the smaller of A and
   B, preferring a number over a NaN.  If exactly one operand is a NaN
   the other operand is returned; if both are NaNs, A is returned.  When
   the operands are zeros of opposite sign the result is -0 whichever
   order they are given in.  */
static double
dminnm (double a, double b)
{
  if (isnan (a))
    return isnan (b) ? a : b;
  if (isnan (b))
    return a;
  /* +0 and -0 compare equal, so pick by sign: if A is not the negative
     one then B is either identical to A or is the -0 we want.  */
  if (a == b)
    return signbit (a) ? a : b;
  return a < b ? a : b;
}

/* Emulate the pairwise FMAXNMP and FMINNMP instructions.  Adjacent
   lanes of Vn are reduced with the NaN-aware max/min helper into the
   low lanes of Vd, and adjacent lanes of Vm into the lanes above them.
   The double precision form is only handled for the full-width vector;
   the half-width form halts as unimplemented.  */
static void
do_vec_FminmaxNMP (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half (0)/full (1)
     instr [29,24] = 10 1110
     instr [23]    = max(0)/min(1)
     instr [22]    = float (0)/double (1)
     instr [21]    = 1
     instr [20,16] = Vm
     instr [15,10] = 1100 01
     instr [9,5]   = Vn
     instr [4.0]   = Vd.  */

  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  int      full = INSTR (30, 30);

  NYI_assert (29, 24, 0x2E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x31);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* NB: Every result is computed before any lane of Vd is written, in
     case the destination is also one of the source vectors.  */
  if (INSTR (22, 22))
    {
      double (* fn)(double, double) = INSTR (23, 23)
	? dminnm : dmaxnm;
      double low, high;

      if (! full)
	HALT_NYI;

      low = fn (aarch64_get_vec_double (cpu, vn, 0),
		aarch64_get_vec_double (cpu, vn, 1));
      high = fn (aarch64_get_vec_double (cpu, vm, 0),
		 aarch64_get_vec_double (cpu, vm, 1));

      /* The Vn pair goes to lane 0 and the Vm pair to lane 1.  */
      aarch64_set_vec_double (cpu, vd, 0, low);
      aarch64_set_vec_double (cpu, vd, 1, high);
    }
  else
    {
      float (* fn)(float, float) = INSTR (23, 23)
	? fminnm : fmaxnm;
      /* Number of pairs taken from each source vector.  */
      unsigned pairs = full ? 2 : 1;
      float    res[4];
      unsigned i;

      for (i = 0; i < pairs; i++)
	{
	  res[i] = fn (aarch64_get_vec_float (cpu, vn, 2 * i),
		       aarch64_get_vec_float (cpu, vn, 2 * i + 1));
	  res[pairs + i] = fn (aarch64_get_vec_float (cpu, vm, 2 * i),
			       aarch64_get_vec_float (cpu, vm, 2 * i + 1));
	}

      for (i = 0; i < 2 * pairs; i++)
	aarch64_set_vec_float (cpu, vd, i, res[i]);
    }
}

/* Emulate the vector AND: Vd = Vn & Vm, processed one 32-bit word at a
   time (two words for the half-width form, four for the full).  */
static void
do_vec_AND (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,21] = 001110001
     instr[20,16] = Vm
     instr[15,10] = 000111
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned words = INSTR (30, 30) ? 4 : 2;
  unsigned w;

  NYI_assert (29, 21, 0x071);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (w = 0; w < words; w++)
    {
      uint32_t lhs = aarch64_get_vec_u32 (cpu, vn, w);
      uint32_t rhs = aarch64_get_vec_u32 (cpu, vm, w);

      aarch64_set_vec_u32 (cpu, vd, w, lhs & rhs);
    }
}

/* Emulate the vector bitwise select: each bit of Vd is replaced by the
   corresponding bit of Vn where Vd originally held 1, and by the bit of
   Vm where it held 0.  Processed one byte at a time.  */
static void
do_vec_BSL (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,21] = 101110011
     instr[20,16] = Vm
     instr[15,10] = 000111
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned bytes = INSTR (30, 30) ? 16 : 8;
  unsigned b;

  NYI_assert (29, 21, 0x173);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (b = 0; b < bytes; b++)
    {
      uint8_t sel = aarch64_get_vec_u8 (cpu, vd, b);
      uint8_t if_set = aarch64_get_vec_u8 (cpu, vn, b);
      uint8_t if_clear = aarch64_get_vec_u8 (cpu, vm, b);

      aarch64_set_vec_u8 (cpu, vd, b,
			  (sel & if_set) | ((~ sel) & if_clear));
    }
}

/* Emulate the vector exclusive OR: Vd = Vn ^ Vm, processed one 32-bit
   word at a time (two words for the half-width form, four for the
   full).  */
static void
do_vec_EOR (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,21] = 10 1110 001
     instr[20,16] = Vm
     instr[15,10] = 000111
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned words = INSTR (30, 30) ? 4 : 2;
  unsigned w;

  NYI_assert (29, 21, 0x171);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (w = 0; w < words; w++)
    {
      uint32_t lhs = aarch64_get_vec_u32 (cpu, vn, w);
      uint32_t rhs = aarch64_get_vec_u32 (cpu, vm, w);

      aarch64_set_vec_u32 (cpu, vd, w, lhs ^ rhs);
    }
}

/* Emulate BIT and BIF (bitwise insert if true / if false).  For BIT a
   bit of Vn replaces the bit of Vd wherever Vm holds 1; for BIF it does
   so wherever Vm holds 0.  All other bits of Vd are preserved.  */
static void
do_vec_bit (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,23] = 10 1110 1
     instr[22]    = BIT (0) / BIF (1)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 0001 11
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned words = INSTR (30, 30) ? 4 : 2;
  const unsigned insert_if_false = INSTR (22, 22);
  unsigned w;

  NYI_assert (29, 23, 0x5D);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (w = 0; w < words; w++)
    {
      uint32_t dest = aarch64_get_vec_u32 (cpu, vd, w);
      uint32_t src = aarch64_get_vec_u32 (cpu, vn, w);
      uint32_t ctrl = aarch64_get_vec_u32 (cpu, vm, w);
      /* Bits of the destination that are left untouched.  */
      uint32_t keep = insert_if_false ? ctrl : ~ctrl;

      aarch64_set_vec_u32 (cpu, vd, w, (dest & keep) | (src & ~keep));
    }
}

/* Emulate the vector OR-NOT: Vd = Vn | ~Vm, processed one byte at a
   time.  */
static void
do_vec_ORN (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,21] = 00 1110 111
     instr[20,16] = Vm
     instr[15,10] = 00 0111
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned bytes = INSTR (30, 30) ? 16 : 8;
  unsigned b;

  NYI_assert (29, 21, 0x077);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (b = 0; b < bytes; b++)
    {
      uint8_t lhs = aarch64_get_vec_u8 (cpu, vn, b);
      uint8_t rhs = aarch64_get_vec_u8 (cpu, vm, b);

      aarch64_set_vec_u8 (cpu, vd, b, lhs | ~ rhs);
    }
}

/* Emulate the vector inclusive OR: Vd = Vn | Vm, processed one byte at
   a time.  */
static void
do_vec_ORR (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,21] = 00 1110 101
     instr[20,16] = Vm
     instr[15,10] = 0001 11
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned bytes = INSTR (30, 30) ? 16 : 8;
  unsigned b;

  NYI_assert (29, 21, 0x075);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (b = 0; b < bytes; b++)
    {
      uint8_t lhs = aarch64_get_vec_u8 (cpu, vn, b);
      uint8_t rhs = aarch64_get_vec_u8 (cpu, vm, b);

      aarch64_set_vec_u8 (cpu, vd, b, lhs | rhs);
    }
}

/* Emulate the vector bit clear: Vd = Vn & ~Vm, processed one byte at a
   time.  */
static void
do_vec_BIC (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,21] = 00 1110 011
     instr[20,16] = Vm
     instr[15,10] = 00 0111
     instr[9,5]   = Vn
     instr[4.0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned bytes = INSTR (30, 30) ? 16 : 8;
  unsigned b;

  NYI_assert (29, 21, 0x073);
  NYI_assert (15, 10, 0x07);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (b = 0; b < bytes; b++)
    {
      uint8_t lhs = aarch64_get_vec_u8 (cpu, vn, b);
      uint8_t rhs = aarch64_get_vec_u8 (cpu, vm, b);

      aarch64_set_vec_u8 (cpu, vd, b, lhs & ~ rhs);
    }
}

/* Emulate the narrowing move: each lane of Vs is truncated to half its
   width and stored in the first or second half of Vd, as selected by
   instr[30].  The 64-bit result size is not a valid encoding and halts
   as unallocated.  */
static void
do_vec_XTN (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = first part (0)/ second part (1)
     instr[29,24] = 00 1110
     instr[23,22] = size: byte(00), half(01), word (10)
     instr[21,10] = 1000 0100 1010
     instr[9,5]   = Vs
     instr[4,0]   = Vd.  */

  unsigned vs = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned bias = INSTR (30, 30);
  unsigned i;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 10, 0x84A);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* NB: Read source values before writing results, in case the source
     and destination vectors are the same.  When writing the second
     half, a narrow result would otherwise land on a wide source lane
     that has not been read yet.  */
  switch (INSTR (23, 22))
    {
    case 0:
      {
	uint8_t narrow[8];

	for (i = 0; i < 8; i++)
	  narrow[i] = (uint8_t) aarch64_get_vec_u16 (cpu, vs, i);
	for (i = 0; i < 8; i++)
	  aarch64_set_vec_u8 (cpu, vd, i + (bias * 8), narrow[i]);
	return;
      }

    case 1:
      {
	uint16_t narrow[4];

	for (i = 0; i < 4; i++)
	  narrow[i] = (uint16_t) aarch64_get_vec_u32 (cpu, vs, i);
	for (i = 0; i < 4; i++)
	  aarch64_set_vec_u16 (cpu, vd, i + (bias * 4), narrow[i]);
	return;
      }

    case 2:
      {
	uint32_t narrow[2];

	for (i = 0; i < 2; i++)
	  narrow[i] = (uint32_t) aarch64_get_vec_u64 (cpu, vs, i);
	for (i = 0; i < 2; i++)
	  aarch64_set_vec_u32 (cpu, vd, i + (bias * 2), narrow[i]);
	return;
      }

    default:
      HALT_UNALLOC;
    }
}

/* Return the number of bits set in the input value.  Compilers that
   report themselves as GCC 3.4 or later use the builtin; anything else
   gets the table-driven fallback below.  */
#if ! (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
static int
popcount (unsigned char x)
{
  /* Number of set bits in each possible 4-bit value.  */
  static const unsigned char nibble_bits[16] =
    {
      0, 1, 1, 2, 1, 2, 2, 3,
      1, 2, 2, 3, 2, 3, 3, 4
    };

  /* Only counts the low 8 bits of the input as that is all we need:
     one lookup for the low nibble and one for the high nibble.  */
  return nibble_bits[x & 0x0F] + nibble_bits[x >> 4];
}
#else
# define popcount __builtin_popcount
#endif

/* Emulate the vector CNT: store the number of set bits of each byte of
   Vs in the matching byte of Vd.  Only the byte element size is
   accepted; any other size halts as unallocated.  */
static void
do_vec_CNT (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half (0)/ full (1)
     instr[29,24] = 00 1110
     instr[23,22] = size: byte(00)
     instr[21,10] = 1000 0001 0110
     instr[9,5]   = Vs
     instr[4,0]   = Vd.  */

  const unsigned vs = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const int bytes = INSTR (30, 30) ? 16 : 8;
  int b;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 10, 0x816);

  if (INSTR (23, 22) != 0)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  for (b = 0; b < bytes; b++)
    {
      uint8_t byte = aarch64_get_vec_u8 (cpu, vs, b);

      aarch64_set_vec_u8 (cpu, vd, b, popcount (byte));
    }
}

/* Emulate SMAXV, SMINV, UMAXV and UMINV: reduce the lanes of the source
   vector to their signed or unsigned maximum or minimum and write that
   single element to the destination vector register.

   The result is stored with aarch64_set_vec_u64 into 64-bit element 0
   of the destination, in the same way as do_vec_ADDV, with the bits
   above the element width cleared.  These instructions produce a SIMD
   scalar, not a general register value.  The 64-bit element size halts
   as unallocated.  */
static void
do_vec_maxv (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29]    = signed (0)/unsigned(1)
     instr[28,24] = 0 1110
     instr[23,22] = size: byte(00), half(01), word (10)
     instr[21]    = 1
     instr[20,17] = 1 000
     instr[16]    = max(0)/min(1)
     instr[15,10] = 1010 10
     instr[9,5]   = V source
     instr[4.0]   = V dest.  */

  unsigned vs = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned full = INSTR (30, 30);
  unsigned size = INSTR (23, 22);
  unsigned is_unsigned = INSTR (29, 29);
  unsigned is_min = INSTR (16, 16);
  unsigned lanes;
  unsigned i;

  NYI_assert (28, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (20, 17, 8);
  NYI_assert (15, 10, 0x2A);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (size == 3)
    HALT_UNALLOC;

  /* 16 or 8 bytes of vector, divided by the element size in bytes.  */
  lanes = (full ? 16 : 8) >> size;

  if (is_unsigned)
    {
      uint64_t best = 0;

      for (i = 0; i < lanes; i++)
	{
	  uint64_t elem;

	  switch (size)
	    {
	    case 0:
	      elem = aarch64_get_vec_u8 (cpu, vs, i);
	      break;
	    case 1:
	      elem = aarch64_get_vec_u16 (cpu, vs, i);
	      break;
	    default:
	      elem = aarch64_get_vec_u32 (cpu, vs, i);
	      break;
	    }

	  /* Lane 0 seeds the running result.  */
	  if (i == 0 || (is_min ? elem < best : elem > best))
	    best = elem;
	}

      aarch64_set_vec_u64 (cpu, rd, 0, best);
    }
  else
    {
      int64_t best = 0;

      for (i = 0; i < lanes; i++)
	{
	  int64_t elem;

	  switch (size)
	    {
	    case 0:
	      elem = aarch64_get_vec_s8 (cpu, vs, i);
	      break;
	    case 1:
	      elem = aarch64_get_vec_s16 (cpu, vs, i);
	      break;
	    default:
	      elem = aarch64_get_vec_s32 (cpu, vs, i);
	      break;
	    }

	  /* Lane 0 seeds the running result.  */
	  if (i == 0 || (is_min ? elem < best : elem > best))
	    best = elem;
	}

      /* Keep only the element's own bits so that a negative result is
	 not sign-extended into the rest of the 64-bit lane.  */
      aarch64_set_vec_u64 (cpu, rd, 0,
			   (uint64_t) best
			   & ((((uint64_t) 1) << (8 << size)) - 1));
    }
}

/* Emulate the across-lanes single precision reductions FMAXV, FMINV,
   FMAXNMV and FMINNMV: fold the four float lanes of the source vector
   into one value and store it with aarch64_set_FP_float.  */
static void
do_vec_fminmaxV (sim_cpu *cpu)
{
  /* Encoding:
     instr[31,24] = 0110 1110
     instr[23]    = max(0)/min(1)
     instr[22,14] = 011 0000 11
     instr[13,12] = nm(00)/normal(11)
     instr[11,10] = 10
     instr[9,5]   = V source
     instr[4.0]   = R dest.  */

  const unsigned vs = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned lane;
  /* Lane 0 seeds the reduction.  */
  float res   = aarch64_get_vec_float (cpu, vs, 0);

  NYI_assert (31, 24, 0x6E);
  NYI_assert (22, 14, 0x0C3);
  NYI_assert (11, 10, 2);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (13, 12))
    {
    case 0: /* FMINNMV / FMAXNMV.  */
      if (INSTR (23, 23))
	{
	  for (lane = 1; lane < 4; lane++)
	    res = fminnm (res, aarch64_get_vec_float (cpu, vs, lane));
	}
      else
	{
	  for (lane = 1; lane < 4; lane++)
	    res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, lane));
	}
      break;

    case 3: /* FMINV / FMAXV.  */
      if (INSTR (23, 23))
	{
	  for (lane = 1; lane < 4; lane++)
	    res = min (res, aarch64_get_vec_float (cpu, vs, lane));
	}
      else
	{
	  for (lane = 1; lane < 4; lane++)
	    res = max (res, aarch64_get_vec_float (cpu, vs, lane));
	}
      break;

    default:
      HALT_NYI;
    }

  aarch64_set_FP_float (cpu, rd, res);
}

/* Emulate the lane-wise floating point FMAX, FMIN, FMAXNM and FMINNM.
   The "nm" variants use the NaN-aware helpers defined in this file; the
   "normal" variants use the C library fmin/fmax family.  The double
   precision form is only handled for the full-width vector.

   NOTE(review): fmin/fmax return the numeric operand when the other is
   a NaN, which looks the same as the "nm" behaviour -- confirm this is
   what the "normal" variants are meant to do.  */
static void
do_vec_Fminmax (sim_cpu *cpu)
{
  /* Encoding:
     instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,24] = 00 1110
     instr[23]    = max(0)/min(1)
     instr[22]    = float(0)/double(1)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,14] = 11
     instr[13,12] = nm(00)/normal(11)
     instr[11,10] = 01
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  const unsigned vm = INSTR (20, 16);
  const unsigned vn = INSTR (9, 5);
  const unsigned vd = INSTR (4, 0);
  const unsigned full = INSTR (30, 30);
  const unsigned is_min = INSTR (23, 23);
  unsigned lane;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 14, 3);
  NYI_assert (11, 10, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      double (* op)(double, double);

      if (! full)
	HALT_NYI;

      switch (INSTR (13, 12))
	{
	case 0:
	  op = is_min ? dminnm : dmaxnm;
	  break;
	case 3:
	  op = is_min ? fmin : fmax;
	  break;
	default:
	  HALT_NYI;
	}

      for (lane = 0; lane < 2; lane++)
	aarch64_set_vec_double (cpu, vd, lane,
				op (aarch64_get_vec_double (cpu, vn, lane),
				    aarch64_get_vec_double (cpu, vm, lane)));
    }
  else
    {
      float (* op)(float, float);

      switch (INSTR (13, 12))
	{
	case 0:
	  op = is_min ? fminnm : fmaxnm;
	  break;
	case 3:
	  op = is_min ? fminf : fmaxf;
	  break;
	default:
	  HALT_NYI;
	}

      for (lane = 0; lane < (full ? 4 : 2); lane++)
	aarch64_set_vec_float (cpu, vd, lane,
			       op (aarch64_get_vec_float (cpu, vn, lane),
				   aarch64_get_vec_float (cpu, vm, lane)));
    }
}

/* Emulate the vector SCVTF: convert each signed integer lane of Vn to
   floating point of the same width and store it in Vd.  The 64-bit form
   requires the full-width vector.  */
static void
do_vec_SCVTF (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = Q
     instr[29,23] = 00 1110 0
     instr[22]    = float(0)/double(1)
     instr[21,10] = 10 0001 1101 10
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned full = INSTR (30, 30);
  unsigned size = INSTR (22, 22);
  unsigned i;

  NYI_assert (29, 23, 0x1C);
  NYI_assert (21, 10, 0x876);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The source lanes are read through the signed accessors: this is the
     signed conversion (instr[29] is 0), so a lane with its top bit set
     must convert to a negative value.  */
  if (size)
    {
      if (! full)
	HALT_UNALLOC;

      for (i = 0; i < 2; i++)
	{
	  double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
	  aarch64_set_vec_double (cpu, vd, i, val);
	}
    }
  else
    {
      for (i = 0; i < (full ? 4 : 2); i++)
	{
	  float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
	  aarch64_set_vec_float (cpu, vd, i, val);
	}
    }
}

/* Lane-wise integer comparison of Vn against Vm.  SOURCE is the prefix
   pasted into the aarch64_get_vec_<SOURCE><bits> accessor name (the use
   visible in this file passes s) and CMP is the C comparison operator
   placed between the two lane values.  Each lane of Vd is written with
   -1 (-1ULL for 64-bit lanes) when the comparison holds and with 0
   otherwise.

   The macro expects cpu, size, full, vn, vm, vd and i to be in scope.
   Every handled size ends with a return from the enclosing function.
   The 64-bit size halts as unallocated unless FULL is set.  */
#define VEC_CMP(SOURCE, CMP)						\
  do									\
    {									\
      switch (size)							\
	{								\
	case 0:								\
	  for (i = 0; i < (full ? 16 : 8); i++)				\
	    aarch64_set_vec_u8 (cpu, vd, i,				\
				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
				CMP					\
				aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
				? -1 : 0);				\
	  return;							\
	case 1:								\
	  for (i = 0; i < (full ? 8 : 4); i++)				\
	    aarch64_set_vec_u16 (cpu, vd, i,				\
				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
				 CMP					\
				 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
				 ? -1 : 0);				\
	  return;							\
	case 2:								\
	  for (i = 0; i < (full ? 4 : 2); i++)				\
	    aarch64_set_vec_u32 (cpu, vd, i, \
				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
				 CMP					\
				 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
				 ? -1 : 0);				\
	  return;							\
	case 3:								\
	  if (! full)							\
	    HALT_UNALLOC;						\
	  for (i = 0; i < 2; i++)					\
	    aarch64_set_vec_u64 (cpu, vd, i, \
				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
				 CMP					\
				 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
				 ? -1ULL : 0);				\
	  return;							\
	}								\
    }									\
  while (0)

/* Lane-wise integer comparison of Vn against zero.  SOURCE is the
   prefix pasted into the aarch64_get_vec_<SOURCE><bits> accessor name
   and CMP is the C comparison operator placed between the lane value
   and 0.  Each lane of Vd is written with -1 (-1ULL for 64-bit lanes)
   when the comparison holds and with 0 otherwise.

   The macro expects cpu, size, full, vn, vd and i to be in scope.
   Every handled size ends with a return from the enclosing function.
   The 64-bit size halts as unallocated unless FULL is set.  */
#define VEC_CMP0(SOURCE, CMP)						\
  do									\
    {									\
      switch (size)							\
	{								\
	case 0:								\
	  for (i = 0; i < (full ? 16 : 8); i++)				\
	    aarch64_set_vec_u8 (cpu, vd, i,				\
				aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
				CMP 0 ? -1 : 0);			\
	  return;							\
	case 1:								\
	  for (i = 0; i < (full ? 8 : 4); i++)				\
	    aarch64_set_vec_u16 (cpu, vd, i,				\
				 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
				 CMP 0 ? -1 : 0);			\
	  return;							\
	case 2:								\
	  for (i = 0; i < (full ? 4 : 2); i++)				\
	    aarch64_set_vec_u32 (cpu, vd, i,				\
				 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
				 CMP 0 ? -1 : 0);			\
	  return;							\
	case 3:								\
	  if (! full)							\
	    HALT_UNALLOC;						\
	  for (i = 0; i < 2; i++)					\
	    aarch64_set_vec_u64 (cpu, vd, i,				\
				 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
				 CMP 0 ? -1ULL : 0);			\
	  return;							\
	}								\
    }									\
  while (0)

/* Lane-wise floating point comparison of Vn against 0.0.  CMP is the C
   comparison operator.  instr[22] selects double (1) or single (0)
   precision lanes; each lane of Vd (64-bit or 32-bit respectively) is
   written with -1 when the comparison holds and with 0 otherwise.

   The macro expects cpu, full, vm, vn, vd and i to be in scope, and
   always ends with a return from the enclosing function.  It halts as
   unimplemented when VM is non-zero, and for double precision when FULL
   is clear.  */
#define VEC_FCMP0(CMP)							\
  do									\
    {									\
      if (vm != 0)							\
	HALT_NYI;							\
      if (INSTR (22, 22))						\
	{								\
	  if (! full)							\
	    HALT_NYI;							\
	  for (i = 0; i < 2; i++)					\
	    aarch64_set_vec_u64 (cpu, vd, i,				\
				 aarch64_get_vec_double (cpu, vn, i)	\
				 CMP 0.0 ? -1 : 0);			\
	}								\
      else								\
	{								\
	  for (i = 0; i < (full ? 4 : 2); i++)				\
	    aarch64_set_vec_u32 (cpu, vd, i,				\
				 aarch64_get_vec_float (cpu, vn, i)	\
				 CMP 0.0 ? -1 : 0);			\
	}								\
      return;								\
    }									\
  while (0)

/* Lane-wise floating point comparison of Vn against Vm.  CMP is the C
   comparison operator.  instr[22] selects double (1) or single (0)
   precision lanes; each lane of Vd (64-bit or 32-bit respectively) is
   written with -1 when the comparison holds and with 0 otherwise.

   The macro expects cpu, full, vm, vn, vd and i to be in scope, and
   always ends with a return from the enclosing function.  For double
   precision it halts as unimplemented when FULL is clear.  */
#define VEC_FCMP(CMP)							\
  do									\
    {									\
      if (INSTR (22, 22))						\
	{								\
	  if (! full)							\
	    HALT_NYI;							\
	  for (i = 0; i < 2; i++)					\
	    aarch64_set_vec_u64 (cpu, vd, i,				\
				 aarch64_get_vec_double (cpu, vn, i)	\
				 CMP					\
				 aarch64_get_vec_double (cpu, vm, i)	\
				 ? -1 : 0);				\
	}								\
      else								\
	{								\
	  for (i = 0; i < (full ? 4 : 2); i++)				\
	    aarch64_set_vec_u32 (cpu, vd, i,				\
				 aarch64_get_vec_float (cpu, vn, i)	\
				 CMP					\
				 aarch64_get_vec_float (cpu, vm, i)	\
				 ? -1 : 0);				\
	}								\
      return;								\
    }									\
  while (0)

/* Decode and emulate the vector compare instructions, integer and
   floating point, against a second register or against zero.  Some
   encodings that reach this function are not compares at all and are
   forwarded to other emulation routines (see below).  */
static void
do_vec_compare (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29]    = part-of-comparison-type
     instr[28,24] = 0 1110
     instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
                    type of float compares: single (-0) / double (-1)
     instr[21]    = 1
     instr[20,16] = Vm or 00000 (compare vs 0)
     instr[15,10] = part-of-comparison-type
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  /* NOTE: `full', `size', `vm', `vn', `vd' and `i' are referenced by name
     from inside the VEC_* macros used below, so they must keep these
     names even where this function does not appear to use them.  */
  int full = INSTR (30, 30);
  int size = INSTR (23, 22);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;

  NYI_assert (28, 24, 0x0E);
  NYI_assert (21, 21, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if ((INSTR (11, 11)
       && INSTR (14, 14))
      || ((INSTR (11, 11) == 0
	   && INSTR (10, 10) == 0)))
    {
      /* A compare vs 0.  */
      if (vm != 0)
	{
	  /* The Vm field is not zero, so this is not a compare against
	     zero after all.  Hand the instruction to the routine that
	     matches its opcode bits, or give up.  */
	  if (INSTR (15, 10) == 0x2A)
	    do_vec_maxv (cpu);
	  else if (INSTR (15, 10) == 0x32
		   || INSTR (15, 10) == 0x3E)
	    do_vec_fminmaxV (cpu);
	  else if (INSTR (29, 23) == 0x1C
		   && INSTR (21, 10) == 0x876)
	    do_vec_SCVTF (cpu);
	  else
	    HALT_NYI;
	  return;
	}
    }

  if (INSTR (14, 14))
    {
      /* A floating point compare.  */
      /* The selector is built from instr[29], instr[23] and
	 instr[13,10], most significant bit first.  */
      unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
	| INSTR (13, 10);

      NYI_assert (15, 15, 1);

      /* VEC_FCMP0 and VEC_FCMP both end in a `return', so no case
	 falls through into the next one.  */
      switch (decode)
	{
	case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
	case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
	case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
	case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
	case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
	case /* 0b111001: GT */   0x39: VEC_FCMP  (>);
	case /* 0b101001: GE */   0x29: VEC_FCMP  (>=);
	case /* 0b001001: EQ */   0x09: VEC_FCMP  (==);

	default:
	  HALT_NYI;
	}
    }
  else
    {
      /* An integer compare.  The selector is instr[29] followed by
	 instr[15,10].  */
      unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);

      /* NOTE(review): the cases rely on VEC_CMP and VEC_CMP0 returning
	 from this function; VEC_CMP is defined outside the part of the
	 file reviewed here -- confirm.  */
      switch (decode)
	{
	case 0x0D: /* 0001101 GT */     VEC_CMP  (s, > );
	case 0x0F: /* 0001111 GE */     VEC_CMP  (s, >= );
	case 0x22: /* 0100010 GT #0 */  VEC_CMP0 (s, > );
	case 0x23: /* 0100011 TST */	VEC_CMP  (u, & );
	case 0x26: /* 0100110 EQ #0 */  VEC_CMP0 (s, == );
	case 0x2A: /* 0101010 LT #0 */  VEC_CMP0 (s, < );
	case 0x4D: /* 1001101 HI */     VEC_CMP  (u, > );
	case 0x4F: /* 1001111 HS */     VEC_CMP  (u, >= );
	case 0x62: /* 1100010 GE #0 */  VEC_CMP0 (s, >= );
	case 0x63: /* 1100011 EQ */     VEC_CMP  (u, == );
	case 0x66: /* 1100110 LE #0 */  VEC_CMP0 (s, <= );
	default:
	  /* Not a recognised compare.  With a zero Vm field there is
	     nothing else to try; otherwise let do_vec_maxv have it.  */
	  if (vm == 0)
	    HALT_NYI;
	  do_vec_maxv (cpu);
	}
    }
}

/* Shift VAL, an element ESIZE bits wide that has been sign-extended to
   64 bits, by the signed amount SHIFT.  A positive amount shifts left and
   a negative amount shifts right arithmetically.  Amounts whose magnitude
   reaches ESIZE are handled explicitly because the equivalent C shift is
   undefined: a left shift then yields 0 and a right shift yields the
   sign fill (0 or -1).  The caller truncates the result to ESIZE bits.  */
static int64_t
vec_sshl_element (int64_t val, int shift, int esize)
{
  if (shift >= esize)
    return 0;

  if (shift >= 0)
    /* Shift the bit pattern as unsigned; left shifting a negative
       value is undefined in C.  */
    return (int64_t) ((uint64_t) val << shift);

  if (shift <= -esize)
    return val < 0 ? -1 : 0;

  return val >> -shift;
}

/* Emulate SSHL (vector): shift each signed element of Vn by the signed
   amount held in the least significant byte of the matching element of
   Vm.  Positive amounts shift left, negative amounts shift right.  */
static void
do_vec_SSHL (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = first part (0)/ second part (1)
     instr[29,24] = 00 1110
     instr[23,22] = size: byte(00), half(01), word (10), long (11)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 0100 01
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  unsigned full = INSTR (30, 30);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;
  signed int shift;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x11);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      for (i = 0; i < (full ? 16 : 8); i++)
	{
	  shift = aarch64_get_vec_s8 (cpu, vm, i);
	  aarch64_set_vec_s8 (cpu, vd, i,
			      (int8_t) vec_sshl_element
			      (aarch64_get_vec_s8 (cpu, vn, i), shift, 8));
	}
      return;

    case 1:
      for (i = 0; i < (full ? 8 : 4); i++)
	{
	  /* The shift amount is the low byte of the 16-bit element.  */
	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
	  aarch64_set_vec_s16 (cpu, vd, i,
			       (int16_t) vec_sshl_element
			       (aarch64_get_vec_s16 (cpu, vn, i), shift, 16));
	}
      return;

    case 2:
      for (i = 0; i < (full ? 4 : 2); i++)
	{
	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
	  aarch64_set_vec_s32 (cpu, vd, i,
			       (int32_t) vec_sshl_element
			       (aarch64_get_vec_s32 (cpu, vn, i), shift, 32));
	}
      return;

    case 3:
      if (! full)
	HALT_UNALLOC;
      for (i = 0; i < 2; i++)
	{
	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
	  aarch64_set_vec_s64 (cpu, vd, i,
			       vec_sshl_element
			       (aarch64_get_vec_s64 (cpu, vn, i), shift, 64));
	}
      return;
    }
}

/* Shift the unsigned element VAL, ESIZE bits wide, by the signed amount
   SHIFT.  A positive amount shifts left and a negative amount shifts
   right logically.  Amounts whose magnitude reaches ESIZE yield 0; they
   are handled explicitly because the equivalent C shift is undefined.
   The caller truncates the result to ESIZE bits.  */
static uint64_t
vec_ushl_element (uint64_t val, int shift, int esize)
{
  if (shift >= esize || shift <= -esize)
    return 0;

  return shift >= 0 ? val << shift : val >> -shift;
}

/* Emulate USHL (vector): shift each unsigned element of Vn by the signed
   amount held in the least significant byte of the matching element of
   Vm.  Positive amounts shift left, negative amounts shift right.  */
static void
do_vec_USHL (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = first part (0)/ second part (1)
     instr[29,24] = 10 1110
     instr[23,22] = size: byte(00), half(01), word (10), long (11)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 0100 01
     instr[9,5]   = Vn
     instr[4,0]   = Vd  */

  unsigned full = INSTR (30, 30);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;
  signed int shift;

  NYI_assert (29, 24, 0x2E);
  NYI_assert (15, 10, 0x11);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      for (i = 0; i < (full ? 16 : 8); i++)
	{
	  shift = aarch64_get_vec_s8 (cpu, vm, i);
	  aarch64_set_vec_u8 (cpu, vd, i,
			      (uint8_t) vec_ushl_element
			      (aarch64_get_vec_u8 (cpu, vn, i), shift, 8));
	}
      return;

    case 1:
      for (i = 0; i < (full ? 8 : 4); i++)
	{
	  /* The shift amount is the low byte of the 16-bit element.  */
	  shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
	  aarch64_set_vec_u16 (cpu, vd, i,
			       (uint16_t) vec_ushl_element
			       (aarch64_get_vec_u16 (cpu, vn, i), shift, 16));
	}
      return;

    case 2:
      for (i = 0; i < (full ? 4 : 2); i++)
	{
	  shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
	  aarch64_set_vec_u32 (cpu, vd, i,
			       (uint32_t) vec_ushl_element
			       (aarch64_get_vec_u32 (cpu, vn, i), shift, 32));
	}
      return;

    case 3:
      if (! full)
	HALT_UNALLOC;
      for (i = 0; i < 2; i++)
	{
	  shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
	  aarch64_set_vec_u64 (cpu, vd, i,
			       vec_ushl_element
			       (aarch64_get_vec_u64 (cpu, vn, i), shift, 64));
	}
      return;
    }
}

/* Emulate FMLA (vector): Vd[i] = Vn[i] * Vm[i] + Vd[i] for every lane.
   The instruction is a fused multiply-add, so the product is not rounded
   before the addition; fma/fmaf give that single rounding, which the
   plain C expression `a * b + c' does not guarantee.  */
static void
do_vec_FMLA (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half selector
     instr[29,23] = 0011100
     instr[22]    = size: 0=>float, 1=>double
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 1100 11
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;
  int      full = INSTR (30, 30);

  NYI_assert (29, 23, 0x1C);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x33);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      /* Double precision only exists in the full (128-bit) form.  */
      if (! full)
	HALT_UNALLOC;
      for (i = 0; i < 2; i++)
	aarch64_set_vec_double (cpu, vd, i,
				fma (aarch64_get_vec_double (cpu, vn, i),
				     aarch64_get_vec_double (cpu, vm, i),
				     aarch64_get_vec_double (cpu, vd, i)));
    }
  else
    {
      for (i = 0; i < (full ? 4 : 2); i++)
	aarch64_set_vec_float (cpu, vd, i,
			       fmaf (aarch64_get_vec_float (cpu, vn, i),
				     aarch64_get_vec_float (cpu, vm, i),
				     aarch64_get_vec_float (cpu, vd, i)));
    }
}

/* Emulate SMAX / UMAX (vector): every lane of Vd receives the larger of
   the corresponding lanes of Vn and Vm.  */
static void
do_vec_max (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half selector
     instr[29]    = SMAX (0) / UMAX (1)
     instr[28,24] = 0 1110
     instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 0110 01
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned lane;
  int      full = INSTR (30, 30);

  NYI_assert (28, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x19);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (29, 29))
    {
      /* Unsigned comparison.  */
      switch (INSTR (23, 22))
	{
	case 0:
	  for (lane = 0; lane < (full ? 16 : 8); lane++)
	    {
	      uint8_t lhs = aarch64_get_vec_u8 (cpu, vn, lane);
	      uint8_t rhs = aarch64_get_vec_u8 (cpu, vm, lane);

	      aarch64_set_vec_u8 (cpu, vd, lane, lhs > rhs ? lhs : rhs);
	    }
	  return;

	case 1:
	  for (lane = 0; lane < (full ? 8 : 4); lane++)
	    {
	      uint16_t lhs = aarch64_get_vec_u16 (cpu, vn, lane);
	      uint16_t rhs = aarch64_get_vec_u16 (cpu, vm, lane);

	      aarch64_set_vec_u16 (cpu, vd, lane, lhs > rhs ? lhs : rhs);
	    }
	  return;

	case 2:
	  for (lane = 0; lane < (full ? 4 : 2); lane++)
	    {
	      uint32_t lhs = aarch64_get_vec_u32 (cpu, vn, lane);
	      uint32_t rhs = aarch64_get_vec_u32 (cpu, vm, lane);

	      aarch64_set_vec_u32 (cpu, vd, lane, lhs > rhs ? lhs : rhs);
	    }
	  return;

	case 3:
	  /* There is no 64-bit form.  */
	  HALT_UNALLOC;
	}
    }
  else
    {
      /* Signed comparison.  */
      switch (INSTR (23, 22))
	{
	case 0:
	  for (lane = 0; lane < (full ? 16 : 8); lane++)
	    {
	      int8_t lhs = aarch64_get_vec_s8 (cpu, vn, lane);
	      int8_t rhs = aarch64_get_vec_s8 (cpu, vm, lane);

	      aarch64_set_vec_s8 (cpu, vd, lane, lhs > rhs ? lhs : rhs);
	    }
	  return;

	case 1:
	  for (lane = 0; lane < (full ? 8 : 4); lane++)
	    {
	      int16_t lhs = aarch64_get_vec_s16 (cpu, vn, lane);
	      int16_t rhs = aarch64_get_vec_s16 (cpu, vm, lane);

	      aarch64_set_vec_s16 (cpu, vd, lane, lhs > rhs ? lhs : rhs);
	    }
	  return;

	case 2:
	  for (lane = 0; lane < (full ? 4 : 2); lane++)
	    {
	      int32_t lhs = aarch64_get_vec_s32 (cpu, vn, lane);
	      int32_t rhs = aarch64_get_vec_s32 (cpu, vm, lane);

	      aarch64_set_vec_s32 (cpu, vd, lane, lhs > rhs ? lhs : rhs);
	    }
	  return;

	case 3:
	  /* There is no 64-bit form.  */
	  HALT_UNALLOC;
	}
    }
}

/* Emulate SMIN / UMIN (vector): every lane of Vd receives the smaller of
   the corresponding lanes of Vn and Vm.  */
static void
do_vec_min (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half selector
     instr[29]    = SMIN (0) / UMIN (1)
     instr[28,24] = 0 1110
     instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 0110 11
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned lane;
  int      full = INSTR (30, 30);

  NYI_assert (28, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x1B);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (29, 29))
    {
      /* Unsigned comparison.  */
      switch (INSTR (23, 22))
	{
	case 0:
	  for (lane = 0; lane < (full ? 16 : 8); lane++)
	    {
	      uint8_t lhs = aarch64_get_vec_u8 (cpu, vn, lane);
	      uint8_t rhs = aarch64_get_vec_u8 (cpu, vm, lane);

	      aarch64_set_vec_u8 (cpu, vd, lane, lhs < rhs ? lhs : rhs);
	    }
	  return;

	case 1:
	  for (lane = 0; lane < (full ? 8 : 4); lane++)
	    {
	      uint16_t lhs = aarch64_get_vec_u16 (cpu, vn, lane);
	      uint16_t rhs = aarch64_get_vec_u16 (cpu, vm, lane);

	      aarch64_set_vec_u16 (cpu, vd, lane, lhs < rhs ? lhs : rhs);
	    }
	  return;

	case 2:
	  for (lane = 0; lane < (full ? 4 : 2); lane++)
	    {
	      uint32_t lhs = aarch64_get_vec_u32 (cpu, vn, lane);
	      uint32_t rhs = aarch64_get_vec_u32 (cpu, vm, lane);

	      aarch64_set_vec_u32 (cpu, vd, lane, lhs < rhs ? lhs : rhs);
	    }
	  return;

	case 3:
	  /* There is no 64-bit form.  */
	  HALT_UNALLOC;
	}
    }
  else
    {
      /* Signed comparison.  */
      switch (INSTR (23, 22))
	{
	case 0:
	  for (lane = 0; lane < (full ? 16 : 8); lane++)
	    {
	      int8_t lhs = aarch64_get_vec_s8 (cpu, vn, lane);
	      int8_t rhs = aarch64_get_vec_s8 (cpu, vm, lane);

	      aarch64_set_vec_s8 (cpu, vd, lane, lhs < rhs ? lhs : rhs);
	    }
	  return;

	case 1:
	  for (lane = 0; lane < (full ? 8 : 4); lane++)
	    {
	      int16_t lhs = aarch64_get_vec_s16 (cpu, vn, lane);
	      int16_t rhs = aarch64_get_vec_s16 (cpu, vm, lane);

	      aarch64_set_vec_s16 (cpu, vd, lane, lhs < rhs ? lhs : rhs);
	    }
	  return;

	case 2:
	  for (lane = 0; lane < (full ? 4 : 2); lane++)
	    {
	      int32_t lhs = aarch64_get_vec_s32 (cpu, vn, lane);
	      int32_t rhs = aarch64_get_vec_s32 (cpu, vm, lane);

	      aarch64_set_vec_s32 (cpu, vd, lane, lhs < rhs ? lhs : rhs);
	    }
	  return;

	case 3:
	  /* There is no 64-bit form.  */
	  HALT_UNALLOC;
	}
    }
}

/* Emulate SSUBL, SSUBL2, USUBL and USUBL2: subtract the elements of the
   lower or upper half of Vm from the matching elements of Vn and write
   the differences to Vd as elements of twice the source width.

   All differences are computed before the first write to Vd, because Vd
   may be the same register as Vn or Vm and its wider elements overlap
   source elements that have not been read yet.  */
static void
do_vec_sub_long (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = lower (0) / upper (1)
     instr[29]    = signed (0) / unsigned (1)
     instr[28,24] = 0 1110
     instr[23,22] = size: bytes (00), half (01), word (10)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 0010 00
     instr[9,5]   = Vn
     instr[4,0]   = V dest.  */

  unsigned size = INSTR (23, 22);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned upper = INSTR (30, 30);
  unsigned is_unsigned = INSTR (29, 29);
  unsigned bias;
  unsigned i;

  NYI_assert (28, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x08);

  if (size == 3)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (size)
    {
    case 0:
      /* Eight byte lanes; the upper half starts at lane 8.  */
      bias = upper ? 8 : 0;
      if (is_unsigned)
	{
	  uint16_t diff[8];

	  for (i = 0; i < 8; i++)
	    diff[i] = (uint16_t) (aarch64_get_vec_u8 (cpu, vn, i + bias)
				  - aarch64_get_vec_u8 (cpu, vm, i + bias));
	  for (i = 0; i < 8; i++)
	    aarch64_set_vec_u16 (cpu, vd, i, diff[i]);
	}
      else
	{
	  int16_t diff[8];

	  for (i = 0; i < 8; i++)
	    diff[i] = (int16_t) (aarch64_get_vec_s8 (cpu, vn, i + bias)
				 - aarch64_get_vec_s8 (cpu, vm, i + bias));
	  for (i = 0; i < 8; i++)
	    aarch64_set_vec_s16 (cpu, vd, i, diff[i]);
	}
      break;

    case 1:
      /* Four halfword lanes; the upper half starts at lane 4.  */
      bias = upper ? 4 : 0;
      if (is_unsigned)
	{
	  uint32_t diff[4];

	  for (i = 0; i < 4; i++)
	    diff[i] = (uint32_t) aarch64_get_vec_u16 (cpu, vn, i + bias)
	      - (uint32_t) aarch64_get_vec_u16 (cpu, vm, i + bias);
	  for (i = 0; i < 4; i++)
	    aarch64_set_vec_u32 (cpu, vd, i, diff[i]);
	}
      else
	{
	  int32_t diff[4];

	  for (i = 0; i < 4; i++)
	    diff[i] = (int32_t) aarch64_get_vec_s16 (cpu, vn, i + bias)
	      - (int32_t) aarch64_get_vec_s16 (cpu, vm, i + bias);
	  for (i = 0; i < 4; i++)
	    aarch64_set_vec_s32 (cpu, vd, i, diff[i]);
	}
      break;

    case 2:
      /* Two word lanes; the upper half starts at lane 2.  The operands
	 are widened before subtracting so that the borrow or sign is
	 carried into the upper 32 bits of the result.  */
      bias = upper ? 2 : 0;
      if (is_unsigned)
	{
	  uint64_t diff[2];

	  for (i = 0; i < 2; i++)
	    diff[i] = (uint64_t) aarch64_get_vec_u32 (cpu, vn, i + bias)
	      - (uint64_t) aarch64_get_vec_u32 (cpu, vm, i + bias);
	  for (i = 0; i < 2; i++)
	    aarch64_set_vec_u64 (cpu, vd, i, diff[i]);
	}
      else
	{
	  int64_t diff[2];

	  for (i = 0; i < 2; i++)
	    diff[i] = (int64_t) aarch64_get_vec_s32 (cpu, vn, i + bias)
	      - (int64_t) aarch64_get_vec_s32 (cpu, vm, i + bias);
	  for (i = 0; i < 2; i++)
	    aarch64_set_vec_s64 (cpu, vd, i, diff[i]);
	}
      break;

    default:
      HALT_UNALLOC;
    }
}

/* Emulate ADDP (vector): add adjacent pairs of elements.  The sums of the
   pairs taken from Vn fill the low lanes of Vd and the sums of the pairs
   taken from Vm fill the lanes above them.  */
static void
do_vec_ADDP (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,24] = 00 1110
     instr[23,22] = size: bytes (00), half (01), word (10), long (11)
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,10] = 1011 11
     instr[9,5]   = Vn
     instr[4,0]   = V dest.  */

  FRegister copy_vn;
  FRegister copy_vm;
  unsigned full = INSTR (30, 30);
  unsigned size = INSTR (23, 22);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned pair, range;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x2F);

  /* Work from snapshots of both sources: the destination may be the
     same register as either of them.  */
  copy_vn = cpu->fr[vn];
  copy_vm = cpu->fr[vm];

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (size == 0)
    {
      range = full ? 8 : 4;
      for (pair = 0; pair < range; pair++)
	{
	  unsigned even = pair * 2;
	  unsigned odd = even + 1;

	  aarch64_set_vec_u8 (cpu, vd, pair,
			      copy_vn.b[even] + copy_vn.b[odd]);
	  aarch64_set_vec_u8 (cpu, vd, pair + range,
			      copy_vm.b[even] + copy_vm.b[odd]);
	}
    }
  else if (size == 1)
    {
      range = full ? 4 : 2;
      for (pair = 0; pair < range; pair++)
	{
	  unsigned even = pair * 2;
	  unsigned odd = even + 1;

	  aarch64_set_vec_u16 (cpu, vd, pair,
			       copy_vn.h[even] + copy_vn.h[odd]);
	  aarch64_set_vec_u16 (cpu, vd, pair + range,
			       copy_vm.h[even] + copy_vm.h[odd]);
	}
    }
  else if (size == 2)
    {
      range = full ? 2 : 1;
      for (pair = 0; pair < range; pair++)
	{
	  unsigned even = pair * 2;
	  unsigned odd = even + 1;

	  aarch64_set_vec_u32 (cpu, vd, pair,
			       copy_vn.w[even] + copy_vn.w[odd]);
	  aarch64_set_vec_u32 (cpu, vd, pair + range,
			       copy_vm.w[even] + copy_vm.w[odd]);
	}
    }
  else
    {
      /* 64-bit elements only exist in the full (128-bit) form.  */
      if (! full)
	HALT_UNALLOC;
      aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
      aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
    }
}

/* Floating point vector convert to longer (precision).

   Only the single to double precision form is implemented; the half to
   single precision form halts with HALT_NYI.  */
static void
do_vec_FCVTL (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half (0) / all (1)
     instr[29,23] = 00 1110 0
     instr[22]    = single (0) / double (1)
     instr[21,10] = 10 0001 0111 10
     instr[9,5]   = Rn
     instr[4,0]   = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned full = INSTR (30, 30);

  NYI_assert (31, 31, 0);
  NYI_assert (29, 23, 0x1C);
  NYI_assert (21, 10, 0x85E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      /* Read both source lanes before writing anything.  When Rd and Rn
	 are the same register, storing double lane 0 overwrites float
	 lanes 0 and 1, so float lane 1 must already have been fetched.  */
      float lo = aarch64_get_vec_float (cpu, rn, 2 * full);
      float hi = aarch64_get_vec_float (cpu, rn, 2 * full + 1);

      aarch64_set_vec_double (cpu, rd, 0, lo);
      aarch64_set_vec_double (cpu, rd, 1, hi);
    }
  else
    {
      /* TODO: Implement missing half-float support.  It would convert
	 the four half precision lanes starting at lane 4 * full of Rn
	 into the four single precision lanes of Rd.  */
      HALT_NYI;
    }
}

/* Emulate FABS (vector): store the absolute value of every floating
   point lane of Vn into the matching lane of Vd.  */
static void
do_vec_FABS (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,23] = 00 1110 1
     instr[22]    = float(0)/double(1)
     instr[21,16] = 10 0000
     instr[15,10] = 1111 10
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned full = INSTR (30, 30);
  unsigned lane;

  NYI_assert (29, 23, 0x1D);
  NYI_assert (21, 10, 0x83E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (! INSTR (22, 22))
    {
      /* Single precision: four lanes, or two for the half form.  */
      for (lane = 0; lane < (full ? 4 : 2); lane++)
	{
	  float src = aarch64_get_vec_float (cpu, vn, lane);

	  aarch64_set_vec_float (cpu, vd, lane, fabsf (src));
	}
      return;
    }

  /* Double precision is only handled in the full (128-bit) form.  */
  if (! full)
    HALT_NYI;

  for (lane = 0; lane < 2; lane++)
    {
      double src = aarch64_get_vec_double (cpu, vn, lane);

      aarch64_set_vec_double (cpu, vd, lane, fabs (src));
    }
}

/* Convert D to a signed 64-bit integer, rounding toward zero.  A NaN
   converts to 0 and values outside the representable range saturate.
   A plain C cast is undefined for those inputs.  */
static int64_t
vec_fcvtzs_double_to_s64 (double d)
{
  if (isnan (d))
    return 0;
  /* 2^63 is the smallest double that does not fit in an int64_t.  */
  if (d >= 9223372036854775808.0)
    return INT64_MAX;
  if (d <= -9223372036854775808.0)
    return INT64_MIN;
  return (int64_t) d;
}

/* Convert F to a signed 32-bit integer, rounding toward zero.  A NaN
   converts to 0 and values outside the representable range saturate.
   A plain C cast is undefined for those inputs.  */
static int32_t
vec_fcvtzs_float_to_s32 (float f)
{
  if (isnan (f))
    return 0;
  /* 2^31 is the smallest float that does not fit in an int32_t.  */
  if (f >= 2147483648.0f)
    return INT32_MAX;
  if (f <= -2147483648.0f)
    return INT32_MIN;
  return (int32_t) f;
}

/* Emulate FCVTZS (vector, integer): convert every floating point lane of
   Rn to a signed integer of the same width, rounding toward zero, and
   store it in the matching lane of Rd.
   NOTE(review): no floating point exception flags are recorded here --
   confirm whether callers expect the status register to be updated.  */
static void
do_vec_FCVTZS (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half (0) / all (1)
     instr[29,23] = 00 1110 1
     instr[22]    = single (0) / double (1)
     instr[21,10] = 10 0001 1011 10
     instr[9,5]   = Rn
     instr[4,0]   = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned full = INSTR (30, 30);
  unsigned i;

  NYI_assert (31, 31, 0);
  NYI_assert (29, 23, 0x1D);
  NYI_assert (21, 10, 0x86E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      /* Double precision only exists in the full (128-bit) form.  */
      if (! full)
	HALT_UNALLOC;

      for (i = 0; i < 2; i++)
	aarch64_set_vec_s64 (cpu, rd, i,
			     vec_fcvtzs_double_to_s64
			     (aarch64_get_vec_double (cpu, rn, i)));
    }
  else
    for (i = 0; i < (full ? 4 : 2); i++)
      aarch64_set_vec_s32 (cpu, rd, i,
			   vec_fcvtzs_float_to_s32
			   (aarch64_get_vec_float (cpu, rn, i)));
}

/* Emulate REV64 (vector): reverse the order of the 8, 16 or 32-bit
   elements inside every 64-bit doubleword of Rn and store the result
   in Rd.  */
static void
do_vec_REV64 (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half
     instr[29,24] = 00 1110
     instr[23,22] = size
     instr[21,10] = 10 0000 0000 10
     instr[9,5]   = Rn
     instr[4,0]   = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned size = INSTR (23, 22);
  unsigned full = INSTR (30, 30);
  unsigned src;
  FRegister val;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 10, 0x802);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* Flipping the low bits of a lane index mirrors that lane within its
     doubleword: 3 bits for bytes, 2 for halfwords, 1 for words.  The
     result is assembled in VAL so that Rd may be the same as Rn.  */
  if (size == 0)
    {
      for (src = 0; src < (full ? 16 : 8); src++)
	val.b[src ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, src);
    }
  else if (size == 1)
    {
      for (src = 0; src < (full ? 8 : 4); src++)
	val.h[src ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, src);
    }
  else if (size == 2)
    {
      for (src = 0; src < (full ? 4 : 2); src++)
	val.w[src ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, src);
    }
  else
    HALT_UNALLOC;

  aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
  if (full)
    aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
}

/* Emulate REV16 (vector): swap the two bytes inside every 16-bit
   halfword of Rn and store the result in Rd.  Only byte elements
   (size 00) are accepted.  */
static void
do_vec_REV16 (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half
     instr[29,24] = 00 1110
     instr[23,22] = size
     instr[21,10] = 10 0000 0001 10
     instr[9,5]   = Rn
     instr[4,0]   = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned size = INSTR (23, 22);
  unsigned full = INSTR (30, 30);
  unsigned src;
  FRegister val;

  NYI_assert (29, 24, 0x0E);
  NYI_assert (21, 10, 0x806);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (size != 0)
    HALT_UNALLOC;
  else
    {
      /* Flipping bit 0 of the byte index exchanges the two bytes of a
	 halfword.  The result is assembled in VAL so that Rd may be the
	 same as Rn.  */
      for (src = 0; src < (full ? 16 : 8); src++)
	val.b[src ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, src);
    }

  aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
  if (full)
    aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
}

/* First level decoder for the vector instructions whose bits [29,24]
   are 00 1110.  It examines instr[21], instr[23,22] and the sub-opcode
   in instr[15,10] (plus instr[20,16] for a few opcodes) and calls the
   emulation routine for the instruction, halting with HALT_NYI when no
   routine matches.  */
static void
do_vec_op1 (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half/full
     instr[29,24] = 00 1110
     instr[23,21] = ???
     instr[20,16] = Vm
     instr[15,10] = sub-opcode
     instr[9,5]   = Vn
     instr[4,0]   = Vd  */
  NYI_assert (29, 24, 0x0E);

  if (INSTR (21, 21) == 0)
    {
      if (INSTR (23, 22) == 0)
	{
	  /* This pattern is tested before the sub-opcode switch, so it
	     takes priority over any case below that it overlaps.  */
	  if (INSTR (30, 30) == 1
	      && INSTR (17, 14) == 0
	      && INSTR (12, 10) == 7)
	    return do_vec_ins_2 (cpu);

	  switch (INSTR (15, 10))
	    {
	    case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
	    case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
	    case 0x07: do_vec_INS (cpu); return;
	    case 0x0B: do_vec_SMOV_into_scalar (cpu); return;
	    case 0x0F: do_vec_UMOV_into_scalar (cpu); return;

	    case 0x00:
	    case 0x08:
	    case 0x10:
	    case 0x18:
	      do_vec_TBL (cpu); return;

	    case 0x06:
	    case 0x16:
	      do_vec_UZP (cpu); return;

	    case 0x0A: do_vec_TRN (cpu); return;

	    case 0x0E:
	    case 0x1E:
	      do_vec_ZIP (cpu); return;

	    default:
	      HALT_NYI;
	    }
	}

      /* instr[23,22] is non-zero here: only the low four sub-opcode
	 bits are used to pick between the permute instructions.  */
      switch (INSTR (13, 10))
	{
	case 0x6: do_vec_UZP (cpu); return;
	case 0xE: do_vec_ZIP (cpu); return;
	case 0xA: do_vec_TRN (cpu); return;
	default:  HALT_NYI;
	}
    }

  /* instr[21] is set from here on.  */
  switch (INSTR (15, 10))
    {
    case 0x02: do_vec_REV64 (cpu); return;
    case 0x06: do_vec_REV16 (cpu); return;

    case 0x07:
      /* The logical operations share a sub-opcode and are told apart by
	 instr[23,21].  Every path of the inner switch returns or halts.  */
      switch (INSTR (23, 21))
	{
	case 1: do_vec_AND (cpu); return;
	case 3: do_vec_BIC (cpu); return;
	case 5: do_vec_ORR (cpu); return;
	case 7: do_vec_ORN (cpu); return;
	default: HALT_NYI;
	}

    case 0x08: do_vec_sub_long (cpu); return;
    case 0x0a: do_vec_XTN (cpu); return;
    case 0x11: do_vec_SSHL (cpu); return;
    case 0x16: do_vec_CNT (cpu); return;
    case 0x19: do_vec_max (cpu); return;
    case 0x1B: do_vec_min (cpu); return;
    case 0x21: do_vec_add (cpu); return;
    case 0x25: do_vec_MLA (cpu); return;
    case 0x27: do_vec_mul (cpu); return;
    case 0x2F: do_vec_ADDP (cpu); return;
    case 0x30: do_vec_mull (cpu); return;
    case 0x33: do_vec_FMLA (cpu); return;
    case 0x35: do_vec_fadd (cpu); return;

    case 0x1E:
      /* Selected further by instr[20,16].  */
      switch (INSTR (20, 16))
	{
	case 0x01: do_vec_FCVTL (cpu); return;
	default: HALT_NYI;
	}

    case 0x2E:
      /* Selected further by instr[20,16].  */
      switch (INSTR (20, 16))
	{
	case 0x00: do_vec_ABS (cpu); return;
	case 0x01: do_vec_FCVTZS (cpu); return;
	case 0x11: do_vec_ADDV (cpu); return;
	default: HALT_NYI;
	}

    case 0x31:
    case 0x3B:
      do_vec_Fminmax (cpu); return;

    /* All of these sub-opcodes are decoded further by do_vec_compare.  */
    case 0x0D:
    case 0x0F:
    case 0x22:
    case 0x23:
    case 0x26:
    case 0x2A:
    case 0x32:
    case 0x36:
    case 0x39:
    case 0x3A:
      do_vec_compare (cpu); return;

    case 0x3E:
      do_vec_FABS (cpu); return;

    default:
      HALT_NYI;
    }
}

/* Emulate SXTL/SSHLL, UXTL/USHLL and their "2" (upper half) variants:
   widen each element of the selected half of the source register to
   twice its width, shift it left by the immediate, and store it in the
   destination.

   Every element is widened before it is shifted so that no bit is lost
   to the narrower source type, and all results are computed before the
   destination is written in case source and destination are the same
   register.  Signed values are scaled by multiplication because a left
   shift of a negative value is undefined in C.  */
static void
do_vec_xtl (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
     instr[28,22] = 0 1111 00
     instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
     instr[15,10] = 1010 01
     instr[9,5]   = V source
     instr[4,0]   = V dest.  */

  unsigned vs = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i, shift, bias = 0;

  NYI_assert (28, 22, 0x3C);
  NYI_assert (15, 10, 0x29);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (30, 29))
    {
    case 2: /* SXTL2, SSHLL2.  */
      bias = 2;
      /* Fall through.  */
    case 0: /* SXTL, SSHLL.  */
      if (INSTR (21, 21))
	{
	  /* 32-bit elements widened to 64 bits.  */
	  int64_t v[2];

	  shift = INSTR (20, 16);
	  for (i = 0; i < 2; i++)
	    v[i] = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias + i)
	      * ((int64_t) 1 << shift);
	  for (i = 0; i < 2; i++)
	    aarch64_set_vec_s64 (cpu, vd, i, v[i]);
	}
      else if (INSTR (20, 20))
	{
	  /* 16-bit elements widened to 32 bits.  */
	  int32_t v[4];

	  shift = INSTR (19, 16);
	  bias *= 2;
	  for (i = 0; i < 4; i++)
	    v[i] = (int32_t) aarch64_get_vec_s16 (cpu, vs, bias + i)
	      * ((int32_t) 1 << shift);
	  for (i = 0; i < 4; i++)
	    aarch64_set_vec_s32 (cpu, vd, i, v[i]);
	}
      else
	{
	  /* 8-bit elements widened to 16 bits.  */
	  int16_t v[8];
	  NYI_assert (19, 19, 1);

	  shift = INSTR (18, 16);
	  bias *= 4;
	  for (i = 0; i < 8; i++)
	    v[i] = (int16_t) (aarch64_get_vec_s8 (cpu, vs, i + bias)
			      * (1 << shift));
	  for (i = 0; i < 8; i++)
	    aarch64_set_vec_s16 (cpu, vd, i, v[i]);
	}
      return;

    case 3: /* UXTL2, USHLL2.  */
      bias = 2;
      /* Fall through.  */
    case 1: /* UXTL, USHLL.  */
      if (INSTR (21, 21))
	{
	  /* 32-bit elements widened to 64 bits.  */
	  uint64_t v[2];

	  shift = INSTR (20, 16);
	  for (i = 0; i < 2; i++)
	    v[i] = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias + i)
	      << shift;
	  for (i = 0; i < 2; i++)
	    aarch64_set_vec_u64 (cpu, vd, i, v[i]);
	}
      else if (INSTR (20, 20))
	{
	  /* 16-bit elements widened to 32 bits.  */
	  uint32_t v[4];

	  shift = INSTR (19, 16);
	  bias *= 2;
	  for (i = 0; i < 4; i++)
	    v[i] = (uint32_t) aarch64_get_vec_u16 (cpu, vs, i + bias)
	      << shift;
	  for (i = 0; i < 4; i++)
	    aarch64_set_vec_u32 (cpu, vd, i, v[i]);
	}
      else
	{
	  /* 8-bit elements widened to 16 bits.  */
	  uint16_t v[8];
	  NYI_assert (19, 19, 1);

	  shift = INSTR (18, 16);
	  bias *= 4;
	  for (i = 0; i < 8; i++)
	    v[i] = (uint16_t) ((uint32_t) aarch64_get_vec_u8 (cpu, vs,
							      i + bias)
			       << shift);
	  for (i = 0; i < 8; i++)
	    aarch64_set_vec_u16 (cpu, vd, i, v[i]);
	}
      return;
    }
}

/* Emulate SHL (vector, immediate): shift every element of Vs left by the
   immediate amount and store the result in Vd.  The position of the
   highest set bit in instr[22,19] selects the element width and the bits
   below it hold the shift amount.  */
static void
do_vec_SHL (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half(0)/full(1)
     instr [29,23] = 001 1110
     instr [22,16] = size and shift amount
     instr [15,10] = 01 0101
     instr [9, 5]  = Vs
     instr [4, 0]  = Vd.  */

  int shift;
  int full    = INSTR (30, 30);
  unsigned vs = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned lane;

  NYI_assert (29, 23, 0x1E);
  NYI_assert (15, 10, 0x15);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      /* 64-bit elements: only valid in the full (128-bit) form.  */
      shift = INSTR (21, 16);

      if (! full)
	HALT_UNALLOC;

      for (lane = 0; lane < 2; lane++)
	{
	  uint64_t src = aarch64_get_vec_u64 (cpu, vs, lane);

	  aarch64_set_vec_u64 (cpu, vd, lane, src << shift);
	}
    }
  else if (INSTR (21, 21))
    {
      /* 32-bit elements.  */
      shift = INSTR (20, 16);

      for (lane = 0; lane < (full ? 4 : 2); lane++)
	{
	  uint32_t src = aarch64_get_vec_u32 (cpu, vs, lane);

	  aarch64_set_vec_u32 (cpu, vd, lane, src << shift);
	}
    }
  else if (INSTR (20, 20))
    {
      /* 16-bit elements.  */
      shift = INSTR (19, 16);

      for (lane = 0; lane < (full ? 8 : 4); lane++)
	{
	  uint16_t src = aarch64_get_vec_u16 (cpu, vs, lane);

	  aarch64_set_vec_u16 (cpu, vd, lane, src << shift);
	}
    }
  else
    {
      /* 8-bit elements; any other encoding is unallocated.  */
      if (INSTR (19, 19) == 0)
	HALT_UNALLOC;

      shift = INSTR (18, 16);

      for (lane = 0; lane < (full ? 16 : 8); lane++)
	{
	  uint8_t src = aarch64_get_vec_u8 (cpu, vs, lane);

	  aarch64_set_vec_u8 (cpu, vd, lane, src << shift);
	}
    }
}

/* Emulate SSHR / USHR (vector, immediate): shift every element of Vs
   right by the immediate amount, arithmetically for SSHR and logically
   for USHR, and store the result in Vd.

   The encoded amount runs from 1 up to and including the element width.
   A C shift by the full width of the (promoted) operand is undefined,
   which affects the 64-bit and 32-bit forms, so that amount is handled
   explicitly there: a logical shift yields 0, and an arithmetic shift
   yields the same value as a shift by one bit less.  */
static void
do_vec_SSHR_USHR (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half(0)/full(1)
     instr [29]    = signed(0)/unsigned(1)
     instr [28,23] = 0 1111 0
     instr [22,16] = size and shift amount
     instr [15,10] = 0000 01
     instr [9, 5]  = Vs
     instr [4, 0]  = Vd.  */

  int full       = INSTR (30, 30);
  int sign       = ! INSTR (29, 29);
  unsigned shift = INSTR (22, 16);
  unsigned vs    = INSTR (9, 5);
  unsigned vd    = INSTR (4, 0);
  unsigned i;

  NYI_assert (28, 23, 0x1E);
  NYI_assert (15, 10, 0x01);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      /* 64-bit elements, shift amount 1..64.  */
      shift = 128 - shift;

      if (full == 0)
	HALT_UNALLOC;

      if (sign)
	{
	  if (shift > 63)
	    shift = 63;
	  for (i = 0; i < 2; i++)
	    {
	      int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
	      aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
	    }
	}
      else
	for (i = 0; i < 2; i++)
	  {
	    uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
	    aarch64_set_vec_u64 (cpu, vd, i, shift > 63 ? 0 : val >> shift);
	  }

      return;
    }

  if (INSTR (21, 21))
    {
      /* 32-bit elements, shift amount 1..32.  */
      shift = 64 - shift;

      if (sign)
	{
	  if (shift > 31)
	    shift = 31;
	  for (i = 0; i < (full ? 4 : 2); i++)
	    {
	      int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
	      aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
	    }
	}
      else
	for (i = 0; i < (full ? 4 : 2); i++)
	  {
	    uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
	    aarch64_set_vec_u32 (cpu, vd, i, shift > 31 ? 0 : val >> shift);
	  }

      return;
    }

  if (INSTR (20, 20))
    {
      /* 16-bit elements, shift amount 1..16.  The operand is promoted
	 to int before shifting, so a shift by 16 is well defined.  */
      shift = 32 - shift;

      if (sign)
	for (i = 0; i < (full ? 8 : 4); i++)
	  {
	    int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
	    aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
	  }
      else
	for (i = 0; i < (full ? 8 : 4); i++)
	  {
	    uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
	    aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
	  }

      return;
    }

  if (INSTR (19, 19) == 0)
    HALT_UNALLOC;

  /* 8-bit elements, shift amount 1..8; promotion to int again makes
     the full-width shift well defined.  */
  shift = 16 - shift;

  if (sign)
    for (i = 0; i < (full ? 16 : 8); i++)
      {
	int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
	aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
      }
  else
    for (i = 0; i < (full ? 16 : 8); i++)
      {
	uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
	aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
      }
}

/* Emulate MUL (vector, by element): multiply every element of Vn by one
   selected element of Vm and store the low half of each product in the
   matching element of Vd.  Only 16-bit and 32-bit elements are valid.  */
static void
do_vec_MUL_by_element (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half/full
     instr[29,24] = 00 1111
     instr[23,22] = size
     instr[21]    = L
     instr[20]    = M
     instr[19,16] = m
     instr[15,12] = 1000
     instr[11]    = H
     instr[10]    = 0
     instr[9,5]   = Vn
     instr[4,0]   = Vd  */

  unsigned full     = INSTR (30, 30);
  unsigned L        = INSTR (21, 21);
  unsigned H        = INSTR (11, 11);
  unsigned vn       = INSTR (9, 5);
  unsigned vd       = INSTR (4, 0);
  unsigned size     = INSTR (23, 22);
  unsigned index;
  unsigned vm;
  unsigned e;

  NYI_assert (29, 24, 0x0F);
  NYI_assert (15, 12, 0x8);
  NYI_assert (10, 10, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (size)
    {
    case 1:
      {
	/* 16 bit products.  The multiplication is carried out on
	   uint32_t operands: two uint16_t values would be promoted to
	   int, and their product can overflow it, which is undefined.  */
	uint32_t element1;
	uint32_t element2;

	/* The element index is H:L:M and the register number has only
	   four bits.  */
	index = (H << 2) | (L << 1) | INSTR (20, 20);
	vm = INSTR (19, 16);
	/* Fetched once, before any write, in case Vd is also Vm.  */
	element2 = aarch64_get_vec_u16 (cpu, vm, index);

	for (e = 0; e < (full ? 8 : 4); e ++)
	  {
	    element1 = aarch64_get_vec_u16 (cpu, vn, e);
	    aarch64_set_vec_u16 (cpu, vd, e,
				 (uint16_t) (element1 * element2));
	  }
      }
      break;

    case 2:
      {
	/* 32 bit products.  */
	uint32_t product;
	uint32_t element1;
	uint32_t element2;

	/* The element index is H:L and the register number has all
	   five bits.  */
	index = (H << 1) | L;
	vm = INSTR (20, 16);
	/* Fetched once, before any write, in case Vd is also Vm.  */
	element2 = aarch64_get_vec_u32 (cpu, vm, index);

	for (e = 0; e < (full ? 4 : 2); e ++)
	  {
	    element1 = aarch64_get_vec_u32 (cpu, vn, e);
	    product  = element1 * element2;
	    aarch64_set_vec_u32 (cpu, vd, e, product);
	  }
      }
      break;

    default:
      HALT_UNALLOC;
    }
}

/* Floating-point multiply-accumulate by element: each lane of Vd becomes
   Vd[lane] + Vn[lane] * Vm[index], in single or double precision.  */
static void
do_FMLA_by_element (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half/full
     instr[29,23] = 00 1111 1
     instr[22]    = size
     instr[21]    = L
     instr[20,16] = m
     instr[15,12] = 0001
     instr[11]    = H
     instr[10]    = 0
     instr[9,5]   = Vn
     instr[4,0]   = Vd  */

  const unsigned q         = INSTR (30, 30);
  const unsigned is_double = INSTR (22, 22);
  const unsigned idx_lo    = INSTR (21, 21);
  const unsigned idx_hi    = INSTR (11, 11);
  const unsigned rm        = INSTR (20, 16);
  const unsigned rn        = INSTR (9, 5);
  const unsigned rd        = INSTR (4, 0);
  unsigned lane;

  NYI_assert (29, 23, 0x1F);
  NYI_assert (15, 12, 0x1);
  NYI_assert (10, 10, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (! is_double)
    {
      /* Single precision: the lane index is H:L.  */
      const unsigned lanes = q ? 4 : 2;
      const float scale = aarch64_get_vec_float (cpu, rm,
						 (idx_hi << 1) | idx_lo);

      for (lane = 0; lane < lanes; lane++)
	{
	  float acc = aarch64_get_vec_float (cpu, rn, lane);

	  acc *= scale;
	  acc += aarch64_get_vec_float (cpu, rd, lane);
	  aarch64_set_vec_float (cpu, rd, lane, acc);
	}
      return;
    }

  /* Double precision needs the full register and a clear L bit; the
     lane index is H alone.  */
  if (! q || idx_lo)
    HALT_UNALLOC;

  {
    const double scale = aarch64_get_vec_double (cpu, rm, idx_hi);

    for (lane = 0; lane < 2; lane++)
      {
	double acc = aarch64_get_vec_double (cpu, rn, lane);

	acc *= scale;
	acc += aarch64_get_vec_double (cpu, rd, lane);
	aarch64_set_vec_double (cpu, rd, lane, acc);
      }
  }
}

/* Second-level decoder for vector instructions whose instr[29,24] is
   00 1111.  The handler is chosen from the sub-opcode in instr[15,10]
   together with instr[23]; unrecognised combinations halt via HALT_NYI.  */
static void
do_vec_op2 (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half/full
     instr[29,24] = 00 1111
     instr[23]    = ?
     instr[22,16] = element size & index
     instr[15,10] = sub-opcode
     instr[9,5]   = Vm
     instr[4,0]   = Vd  */

  const unsigned bit23 = INSTR (23, 23);

  NYI_assert (29, 24, 0x0F);

  switch (INSTR (15, 10))
    {
    /* Multiply by element is accepted whatever instr[23] is.  */
    case 0x20:
    case 0x22:
      do_vec_MUL_by_element (cpu);
      return;

    /* Only decoded when instr[23] is set.  */
    case 0x04:
    case 0x06:
      if (bit23)
	{
	  do_FMLA_by_element (cpu);
	  return;
	}
      break;

    /* The remaining handlers are only decoded when instr[23] is clear.  */
    case 0x01:
      if (! bit23)
	{
	  do_vec_SSHR_USHR (cpu);
	  return;
	}
      break;

    case 0x15:
      if (! bit23)
	{
	  do_vec_SHL (cpu);
	  return;
	}
      break;

    case 0x29:
      if (! bit23)
	{
	  do_vec_xtl (cpu);
	  return;
	}
      break;

    default:
      break;
    }

  HALT_NYI;
}

/* Vector integer negate: Vd[lane] = - Vs[lane] for 8, 16, 32 or 64-bit
   signed lanes.  The 64-bit lane form is only emulated for the full
   register.  */
static void
do_vec_neg (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full(1)/half(0)
     instr[29,24] = 10 1110
     instr[23,22] = size: byte(00), half (01), word (10), long (11)
     instr[21,10] = 1000 0010 1110
     instr[9,5]   = Vs
     instr[4,0]   = Vd  */

  const unsigned q  = INSTR (30, 30);
  const unsigned rs = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned lanes;
  unsigned lane;

  NYI_assert (29, 24, 0x2E);
  NYI_assert (21, 10, 0x82E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      lanes = q ? 16 : 8;
      for (lane = 0; lane < lanes; lane++)
	{
	  int8_t v = aarch64_get_vec_s8 (cpu, rs, lane);

	  aarch64_set_vec_s8 (cpu, rd, lane, - v);
	}
      break;

    case 1:
      lanes = q ? 8 : 4;
      for (lane = 0; lane < lanes; lane++)
	{
	  int16_t v = aarch64_get_vec_s16 (cpu, rs, lane);

	  aarch64_set_vec_s16 (cpu, rd, lane, - v);
	}
      break;

    case 2:
      lanes = q ? 4 : 2;
      for (lane = 0; lane < lanes; lane++)
	{
	  int32_t v = aarch64_get_vec_s32 (cpu, rs, lane);

	  aarch64_set_vec_s32 (cpu, rd, lane, - v);
	}
      break;

    case 3:
      /* The half-register form is not emulated.  */
      if (! q)
	HALT_NYI;
      for (lane = 0; lane < 2; lane++)
	{
	  int64_t v = aarch64_get_vec_s64 (cpu, rs, lane);

	  aarch64_set_vec_s64 (cpu, rd, lane, - v);
	}
      break;
    }
}

/* Vector floating-point square root: Vd[lane] = sqrt (Vs[lane]), in
   single or double precision as selected by instr[22].  */
static void
do_vec_sqrt (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full(1)/half(0)
     instr[29,23] = 101 1101
     instr[22]    = single(0)/double(1)
     instr[21,10] = 1000 0111 1110
     instr[9,5]   = Vs
     instr[4,0]   = Vd.  */

  int    full = INSTR (30, 30);
  unsigned vs = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;

  /* 101 1101 is 0x5D.  The value checked previously, 0x5B, implies
     instr[29,24] == 10 1101, which can never hold for an instruction
     decoded under instr[29,24] == 10 1110, so every call halted.  */
  NYI_assert (29, 23, 0x5D);
  NYI_assert (21, 10, 0x87E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22) == 0)
    for (i = 0; i < (full ? 4 : 2); i++)
      aarch64_set_vec_float (cpu, vd, i,
			     sqrtf (aarch64_get_vec_float (cpu, vs, i)));
  else
    {
      /* Two double lanes need the full register; reject the half form
	 the same way do_vec_FSQRT does.  */
      if (! full)
	HALT_UNALLOC;

      for (i = 0; i < 2; i++)
	aarch64_set_vec_double (cpu, vd, i,
				sqrt (aarch64_get_vec_double (cpu, vs, i)));
    }
}

/* Vector multiply-subtract by element: every lane of Vd becomes
   Vd[lane] - Vs[lane] * Vm[index], truncated to the lane width.  Only
   16-bit and 32-bit lanes are handled; other sizes halt via HALT_NYI.

   The lane index and Vm are decoded the same way as in
   do_vec_MUL_by_element, and lanes are accessed at their own width so
   that lane numbers stay inside the 128-bit register.  */
static void
do_vec_mls_indexed (sim_cpu *cpu)
{
  /* instr[31]       = 0
     instr[30]       = half(0)/full(1)
     instr[29,24]    = 10 1111
     instr[23,22]    = 16-bit(01)/32-bit(10)
     instr[21]       = L
     instr[20]       = M
     instr[19,16]    = Vm (16-bit; index is H:L:M)
     instr[20,16]    = Vm (32-bit; index is H:L)
     instr[15,12]    = 0100
     instr[11]       = H
     instr[10]       = 0
     instr[9,5]      = Vs
     instr[4,0]      = Vd.  */

  int    full = INSTR (30, 30);
  unsigned L  = INSTR (21, 21);
  unsigned H  = INSTR (11, 11);
  unsigned vs = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;

  NYI_assert (15, 12, 4);
  NYI_assert (10, 10, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 1:
      {
	/* 16-bit lanes.  instr[20] is the low index bit, so Vm is the
	   four-bit field below it.  */
	unsigned elem = (H << 2) | (L << 1) | INSTR (20, 20);
	unsigned vm = INSTR (19, 16);
	/* Kept in uint32_t so the multiply below is unsigned rather than
	   a promoted signed int multiply that could overflow.  */
	uint32_t val = aarch64_get_vec_u16 (cpu, vm, elem);

	for (i = 0; i < (full ? 8 : 4); i++)
	  {
	    uint32_t acc = aarch64_get_vec_u16 (cpu, vd, i);
	    uint32_t src = aarch64_get_vec_u16 (cpu, vs, i);

	    aarch64_set_vec_u16 (cpu, vd, i, (uint16_t) (acc - src * val));
	  }
	return;
      }

    case 2:
      {
	/* 32-bit lanes.  */
	unsigned elem = (H << 1) | L;
	unsigned vm = INSTR (20, 16);
	uint32_t val = aarch64_get_vec_u32 (cpu, vm, elem);

	for (i = 0; i < (full ? 4 : 2); i++)
	  aarch64_set_vec_u32 (cpu, vd, i,
			       aarch64_get_vec_u32 (cpu, vd, i) -
			       (aarch64_get_vec_u32 (cpu, vs, i) * val));
	return;
      }

    case 0:
    case 3:
    default:
      HALT_NYI;
    }
}

/* Vector integer subtract: Vd[lane] = Vn[lane] - Vm[lane] for 8, 16, 32
   or 64-bit lanes.  64-bit lanes require the full register.  */
static void
do_vec_SUB (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half(0)/full(1)
     instr [29,24] = 10 1110
     instr [23,22] = size: byte(00), half(01), word (10), long (11)
     instr [21]    = 1
     instr [20,16] = Vm
     instr [15,10] = 10 0001
     instr [9, 5]  = Vn
     instr [4, 0]  = Vd.  */

  const unsigned q  = INSTR (30, 30);
  const unsigned rm = INSTR (20, 16);
  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned lanes;
  unsigned lane;

  NYI_assert (29, 24, 0x2E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x21);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      lanes = q ? 16 : 8;
      for (lane = 0; lane < lanes; lane++)
	{
	  int8_t minuend    = aarch64_get_vec_s8 (cpu, rn, lane);
	  int8_t subtrahend = aarch64_get_vec_s8 (cpu, rm, lane);

	  aarch64_set_vec_s8 (cpu, rd, lane, minuend - subtrahend);
	}
      break;

    case 1:
      lanes = q ? 8 : 4;
      for (lane = 0; lane < lanes; lane++)
	{
	  int16_t minuend    = aarch64_get_vec_s16 (cpu, rn, lane);
	  int16_t subtrahend = aarch64_get_vec_s16 (cpu, rm, lane);

	  aarch64_set_vec_s16 (cpu, rd, lane, minuend - subtrahend);
	}
      break;

    case 2:
      lanes = q ? 4 : 2;
      for (lane = 0; lane < lanes; lane++)
	{
	  int32_t minuend    = aarch64_get_vec_s32 (cpu, rn, lane);
	  int32_t subtrahend = aarch64_get_vec_s32 (cpu, rm, lane);

	  aarch64_set_vec_s32 (cpu, rd, lane, minuend - subtrahend);
	}
      break;

    case 3:
      if (q == 0)
	HALT_UNALLOC;

      for (lane = 0; lane < 2; lane++)
	{
	  int64_t minuend    = aarch64_get_vec_s64 (cpu, rn, lane);
	  int64_t subtrahend = aarch64_get_vec_s64 (cpu, rm, lane);

	  aarch64_set_vec_s64 (cpu, rd, lane, minuend - subtrahend);
	}
      break;
    }
}

/* Vector multiply-subtract: Vd[lane] = Vd[lane] - Vn[lane] * Vm[lane],
   truncated to the lane width, for 8, 16 or 32-bit lanes.  Size 11 halts
   as unallocated.  */
static void
do_vec_MLS (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half(0)/full(1)
     instr [29,24] = 10 1110
     instr [23,22] = size: byte(00), half(01), word (10)
     instr [21]    = 1
     instr [20,16] = Vm
     instr [15,10] = 10 0101
     instr [9, 5]  = Vn
     instr [4, 0]  = Vd.  */

  unsigned full = INSTR (30, 30);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;

  NYI_assert (29, 24, 0x2E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x25);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      /* uint8_t operands promote to int; 255 * 255 cannot overflow.  */
      for (i = 0; i < (full ? 16 : 8); i++)
	aarch64_set_vec_u8 (cpu, vd, i,
			    aarch64_get_vec_u8 (cpu, vd, i)
			    - (aarch64_get_vec_u8 (cpu, vn, i)
			       * aarch64_get_vec_u8 (cpu, vm, i)));
      return;

    case 1:
      for (i = 0; i < (full ? 8 : 4); i++)
	{
	  /* Widen to uint32_t first: two uint16_t operands would be
	     promoted to signed int, and 0xFFFF * 0xFFFF overflows it.  */
	  uint32_t acc = aarch64_get_vec_u16 (cpu, vd, i);
	  uint32_t lhs = aarch64_get_vec_u16 (cpu, vn, i);
	  uint32_t rhs = aarch64_get_vec_u16 (cpu, vm, i);

	  aarch64_set_vec_u16 (cpu, vd, i, (uint16_t) (acc - lhs * rhs));
	}
      return;

    case 2:
      for (i = 0; i < (full ? 4 : 2); i++)
	aarch64_set_vec_u32 (cpu, vd, i,
			     aarch64_get_vec_u32 (cpu, vd, i)
			     - (aarch64_get_vec_u32 (cpu, vn, i)
				* aarch64_get_vec_u32 (cpu, vm, i)));
      return;

    default:
      HALT_UNALLOC;
    }
}

/* Vector floating-point divide: Vd[lane] = Vn[lane] / Vm[lane], single
   or double precision.  The double form requires the full register.  */
static void
do_vec_FDIV (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half(0)/full(1)
     instr [29,23] = 10 1110 0
     instr [22]    = float(0)/double(1)
     instr [21]    = 1
     instr [20,16] = Vm
     instr [15,10] = 1111 11
     instr [9, 5]  = Vn
     instr [4, 0]  = Vd.  */

  const unsigned q  = INSTR (30, 30);
  const unsigned rm = INSTR (20, 16);
  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned lane;

  NYI_assert (29, 23, 0x5C);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x3F);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (! INSTR (22, 22))
    {
      const unsigned lanes = q ? 4 : 2;

      for (lane = 0; lane < lanes; lane++)
	{
	  float dividend = aarch64_get_vec_float (cpu, rn, lane);
	  float divisor  = aarch64_get_vec_float (cpu, rm, lane);

	  aarch64_set_vec_float (cpu, rd, lane, dividend / divisor);
	}
      return;
    }

  if (! q)
    HALT_UNALLOC;

  for (lane = 0; lane < 2; lane++)
    {
      double dividend = aarch64_get_vec_double (cpu, rn, lane);
      double divisor  = aarch64_get_vec_double (cpu, rm, lane);

      aarch64_set_vec_double (cpu, rd, lane, dividend / divisor);
    }
}

/* Vector floating-point multiply: Vd[lane] = Vn[lane] * Vm[lane], single
   or double precision.  The double form requires the full register.  */
static void
do_vec_FMUL (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half(0)/full(1)
     instr [29,23] = 10 1110 0
     instr [22]    = float(0)/double(1)
     instr [21]    = 1
     instr [20,16] = Vm
     instr [15,10] = 1101 11
     instr [9, 5]  = Vn
     instr [4, 0]  = Vd.  */

  const unsigned q  = INSTR (30, 30);
  const unsigned rm = INSTR (20, 16);
  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned lane;

  NYI_assert (29, 23, 0x5C);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x37);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (! INSTR (22, 22))
    {
      const unsigned lanes = q ? 4 : 2;

      for (lane = 0; lane < lanes; lane++)
	{
	  float lhs = aarch64_get_vec_float (cpu, rn, lane);
	  float rhs = aarch64_get_vec_float (cpu, rm, lane);

	  aarch64_set_vec_float (cpu, rd, lane, lhs * rhs);
	}
      return;
    }

  if (! q)
    HALT_UNALLOC;

  for (lane = 0; lane < 2; lane++)
    {
      double lhs = aarch64_get_vec_double (cpu, rn, lane);
      double rhs = aarch64_get_vec_double (cpu, rm, lane);

      aarch64_set_vec_double (cpu, rd, lane, lhs * rhs);
    }
}

/* Vector floating-point pairwise add.  Adjacent lanes of Vn are summed
   into the low lanes of Vd and adjacent lanes of Vm into the high lanes.
   All sources are read before Vd is written, since Vd may be the same
   register as Vn or Vm.  */
static void
do_vec_FADDP (sim_cpu *cpu)
{
  /* instr [31]    = 0
     instr [30]    = half(0)/full(1)
     instr [29,23] = 10 1110 0
     instr [22]    = float(0)/double(1)
     instr [21]    = 1
     instr [20,16] = Vm
     instr [15,10] = 1101 01
     instr [9, 5]  = Vn
     instr [4, 0]  = Vd.  */

  const unsigned q  = INSTR (30, 30);
  const unsigned rm = INSTR (20, 16);
  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);

  NYI_assert (29, 23, 0x5C);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x35);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      double n0 = aarch64_get_vec_double (cpu, rn, 0);
      double n1 = aarch64_get_vec_double (cpu, rn, 1);
      double m0 = aarch64_get_vec_double (cpu, rm, 0);
      double m1 = aarch64_get_vec_double (cpu, rm, 1);

      /* The double form only exists for the full register.  */
      if (! q)
	HALT_UNALLOC;

      aarch64_set_vec_double (cpu, rd, 0, n0 + n1);
      aarch64_set_vec_double (cpu, rd, 1, m0 + m1);
      return;
    }

  {
    float n0 = aarch64_get_vec_float (cpu, rn, 0);
    float n1 = aarch64_get_vec_float (cpu, rn, 1);
    float m0 = aarch64_get_vec_float (cpu, rm, 0);
    float m1 = aarch64_get_vec_float (cpu, rm, 1);

    if (! q)
      {
	/* Half register: one pair from each source.  */
	aarch64_set_vec_float (cpu, rd, 0, n0 + n1);
	aarch64_set_vec_float (cpu, rd, 1, m0 + m1);
	return;
      }

    {
      /* Full register: two pairs from each source.  */
      float n2 = aarch64_get_vec_float (cpu, rn, 2);
      float n3 = aarch64_get_vec_float (cpu, rn, 3);
      float m2 = aarch64_get_vec_float (cpu, rm, 2);
      float m3 = aarch64_get_vec_float (cpu, rm, 3);

      aarch64_set_vec_float (cpu, rd, 0, n0 + n1);
      aarch64_set_vec_float (cpu, rd, 1, n2 + n3);
      aarch64_set_vec_float (cpu, rd, 2, m0 + m1);
      aarch64_set_vec_float (cpu, rd, 3, m2 + m3);
    }
  }
}

/* Vector floating-point square root: Vd[lane] = sqrt (Vn[lane]), single
   or double precision.  The double form requires the full register.  */
static void
do_vec_FSQRT (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29,23] = 10 1110 1
     instr[22]    = single(0)/double(1)
     instr[21,10] = 10 0001 1111 10
     instr[9,5]   = Vsrc
     instr[4,0]   = Vdest.  */

  const unsigned q  = INSTR (30, 30);
  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned lane;

  NYI_assert (29, 23, 0x5D);
  NYI_assert (21, 10, 0x87E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (! INSTR (22, 22))
    {
      const unsigned lanes = q ? 4 : 2;

      for (lane = 0; lane < lanes; lane++)
	{
	  float v = aarch64_get_vec_float (cpu, rn, lane);

	  aarch64_set_vec_float (cpu, rd, lane, sqrtf (v));
	}
      return;
    }

  if (! q)
    HALT_UNALLOC;

  for (lane = 0; lane < 2; lane++)
    {
      double v = aarch64_get_vec_double (cpu, rn, lane);

      aarch64_set_vec_double (cpu, rd, lane, sqrt (v));
    }
}

/* Vector floating-point negate: Vd[lane] = - Vn[lane], single or double
   precision.  The double form requires the full register.  */
static void
do_vec_FNEG (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,23] = 10 1110 1
     instr[22]    = single (0)/double (1)
     instr[21,10] = 10 0000 1111 10
     instr[9,5]   = Vsrc
     instr[4,0]   = Vdest.  */

  const unsigned q  = INSTR (30, 30);
  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned lane;

  NYI_assert (29, 23, 0x5D);
  NYI_assert (21, 10, 0x83E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (! INSTR (22, 22))
    {
      const unsigned lanes = q ? 4 : 2;

      for (lane = 0; lane < lanes; lane++)
	{
	  float v = aarch64_get_vec_float (cpu, rn, lane);

	  aarch64_set_vec_float (cpu, rd, lane, - v);
	}
      return;
    }

  if (! q)
    HALT_UNALLOC;

  for (lane = 0; lane < 2; lane++)
    {
      double v = aarch64_get_vec_double (cpu, rn, lane);

      aarch64_set_vec_double (cpu, rd, lane, - v);
    }
}

/* Vector bitwise NOT: every byte of Vd becomes the complement of the
   matching byte of Vn, over 8 or 16 bytes depending on instr[30].  */
static void
do_vec_NOT (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,10] = 10 1110 0010 0000 0101 10
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  const unsigned bytes = INSTR (30, 30) ? 16 : 8;
  unsigned pos;

  NYI_assert (29, 10, 0xB8816);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  for (pos = 0; pos < bytes; pos++)
    {
      uint8_t v = aarch64_get_vec_u8 (cpu, rn, pos);

      aarch64_set_vec_u8 (cpu, rd, pos, ~ v);
    }
}

/* Count the leading zero bits of VAL when it is viewed as a SIZE-bit
   quantity.  Bits of VAL above bit SIZE-1 are ignored, and the result is
   SIZE when none of the low SIZE bits is set.  SIZE must be between 1
   and 64.  */
static unsigned int
clz (uint64_t val, unsigned size)
{
  /* Start at the most significant bit of the SIZE-bit field and walk
     down until a set bit is found or the probe falls off the bottom.  */
  uint64_t probe = (uint64_t) 1 << (size - 1);
  unsigned int zeros = 0;

  while (probe != 0 && (val & probe) == 0)
    {
      probe >>= 1;
      zeros++;
    }

  return zeros;
}

/* Vector count leading zeros: Vd[lane] = clz (Vn[lane]) for 8, 16, 32 or
   64-bit lanes.  The 64-bit lane form requires the full register.  */
static void
do_vec_CLZ (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half (0)/full (1)
     instr[29,24] = 10 1110
     instr[23,22] = size
     instr[21,10] = 10 0000 0100 10
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  const unsigned rn = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  const unsigned q  = INSTR (30, 30);
  unsigned lanes;
  unsigned lane;

  NYI_assert (29, 24, 0x2E);
  NYI_assert (21, 10, 0x812);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
    {
    case 0:
      lanes = q ? 16 : 8;
      for (lane = 0; lane < lanes; lane++)
	{
	  uint8_t v = aarch64_get_vec_u8 (cpu, rn, lane);

	  aarch64_set_vec_u8 (cpu, rd, lane, clz (v, 8));
	}
      break;

    case 1:
      lanes = q ? 8 : 4;
      for (lane = 0; lane < lanes; lane++)
	{
	  uint16_t v = aarch64_get_vec_u16 (cpu, rn, lane);

	  aarch64_set_vec_u16 (cpu, rd, lane, clz (v, 16));
	}
      break;

    case 2:
      lanes = q ? 4 : 2;
      for (lane = 0; lane < lanes; lane++)
	{
	  uint32_t v = aarch64_get_vec_u32 (cpu, rn, lane);

	  aarch64_set_vec_u32 (cpu, rd, lane, clz (v, 32));
	}
      break;

    case 3:
      if (! q)
	HALT_UNALLOC;
      for (lane = 0; lane < 2; lane++)
	{
	  uint64_t v = aarch64_get_vec_u64 (cpu, rn, lane);

	  aarch64_set_vec_u64 (cpu, rd, lane, clz (v, 64));
	}
      break;
    }
}

/* Copy one vector element from Vs into one element of Vd.  The lowest
   set bit of instr[20,16] selects the element width (bit 16: byte, bit
   17: 16-bit, bit 18: 32-bit, bit 19: 64-bit); the bits above it give
   the destination index.  The source index sits in the top bits of
   instr[14,11], and the bits below it must be zero.  */
static void
do_vec_MOV_element (sim_cpu *cpu)
{
  /* instr[31,21] = 0110 1110 000
     instr[20,16] = size & dest index
     instr[15]    = 0
     instr[14,11] = source index
     instr[10]    = 1
     instr[9,5]   = Vs
     instr[4,0]   = Vd.  */

  const unsigned rs = INSTR (9, 5);
  const unsigned rd = INSTR (4, 0);
  unsigned from;
  unsigned to;

  NYI_assert (31, 21, 0x370);
  NYI_assert (15, 15, 0);
  NYI_assert (10, 10, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (INSTR (16, 16))
    {
      /* Byte element.  */
      from = INSTR (14, 11);
      to   = INSTR (20, 17);
      aarch64_set_vec_u8 (cpu, rd, to,
			  aarch64_get_vec_u8 (cpu, rs, from));
      return;
    }

  if (INSTR (17, 17))
    {
      /* 16-bit element.  */
      NYI_assert (11, 11, 0);
      from = INSTR (14, 12);
      to   = INSTR (20, 18);
      aarch64_set_vec_u16 (cpu, rd, to,
			   aarch64_get_vec_u16 (cpu, rs, from));
      return;
    }

  if (INSTR (18, 18))
    {
      /* 32-bit element.  */
      NYI_assert (12, 11, 0);
      from = INSTR (14, 13);
      to   = INSTR (20, 19);
      aarch64_set_vec_u32 (cpu, rd, to,
			   aarch64_get_vec_u32 (cpu, rs, from));
      return;
    }

  /* 64-bit element.  */
  NYI_assert (19, 19, 1);
  NYI_assert (13, 11, 0);
  from = INSTR (14, 14);
  to   = INSTR (20, 20);
  aarch64_set_vec_u64 (cpu, rd, to,
		       aarch64_get_vec_u64 (cpu, rs, from));
}

/* Reverse the order of the bytes (size 0) or 16-bit elements (size 1)
   inside every 32-bit word of Rn and write the result to Rd.  Other
   sizes halt as unallocated.  */
static void
do_vec_REV32 (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half
     instr[29,24] = 10 1110
     instr[23,22] = size
     instr[21,10] = 10 0000 0000 10
     instr[9,5]   = Rn
     instr[4,0]   = Rd.  */

  const unsigned src  = INSTR (9, 5);
  const unsigned dst  = INSTR (4, 0);
  const unsigned size = INSTR (23, 22);
  const unsigned q    = INSTR (30, 30);
  unsigned lane;
  FRegister reversed;

  NYI_assert (29, 24, 0x2E);
  NYI_assert (21, 10, 0x802);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* The result is built in a scratch register first, so the source is
     fully read before the destination is written.  */
  if (size == 0)
    {
      /* Flipping the low two index bits mirrors a byte inside its word.  */
      const unsigned count = q ? 16 : 8;

      for (lane = 0; lane < count; lane++)
	reversed.b[lane ^ 0x3] = aarch64_get_vec_u8 (cpu, src, lane);
    }
  else if (size == 1)
    {
      /* Flipping the low index bit swaps the two halves of a word.  */
      const unsigned count = q ? 8 : 4;

      for (lane = 0; lane < count; lane++)
	reversed.h[lane ^ 0x1] = aarch64_get_vec_u16 (cpu, src, lane);
    }
  else
    HALT_UNALLOC;

  aarch64_set_vec_u64 (cpu, dst, 0, reversed.v[0]);
  if (q)
    aarch64_set_vec_u64 (cpu, dst, 1, reversed.v[1]);
}

/* Byte extract from a register pair: Vd receives the bytes of Vn from
   the source index upwards, followed by the low bytes of Vm, until the
   vector is filled.  */
static void
do_vec_EXT (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = full/half
     instr[29,21] = 10 1110 000
     instr[20,16] = Vm
     instr[15]    = 0
     instr[14,11] = source index
     instr[10]    = 0
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  const unsigned rm    = INSTR (20, 16);
  const unsigned rn    = INSTR (9, 5);
  const unsigned rd    = INSTR (4, 0);
  const unsigned first = INSTR (14, 11);
  const unsigned q     = INSTR (30, 30);
  const unsigned width = q ? 16 : 8;
  unsigned in;
  unsigned out = 0;
  FRegister bytes;

  /* Note that 0x370 has bit 30 set, so this check only lets the full
     (128-bit) form through.  */
  NYI_assert (31, 21, 0x370);
  NYI_assert (15, 15, 0);
  NYI_assert (10, 10, 0);

  /* A half-width extract cannot start beyond byte 7.  */
  if (! q && (first & 0x8))
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* Collect into a scratch register so Vd may alias Vn or Vm.  */
  for (in = first; in < width; in++)
    bytes.b[out++] = aarch64_get_vec_u8 (cpu, rn, in);
  for (in = 0; in < first; in++)
    bytes.b[out++] = aarch64_get_vec_u8 (cpu, rm, in);

  aarch64_set_vec_u64 (cpu, rd, 0, bytes.v[0]);
  if (q)
    aarch64_set_vec_u64 (cpu, rd, 1, bytes.v[1]);
}

/* Top-level decoder for this group of vector instructions.  A handful of
   exact bit patterns are tested first; everything else is dispatched on
   instr[29,24] and then on narrower fields.  Each recognised pattern
   calls its do_vec_* handler and returns; anything left over reaches the
   HALT_NYI at the bottom.  The order of the tests matters, because the
   early patterns overlap the ones handled by the general switch.  */
static void
dexAdvSIMD0 (sim_cpu *cpu)
{
  /* instr [28,25] = 0 111.  */

  /* Sub-opcode 0x07 with Vn == Vm: the two instr[31,21] patterns below
     are treated as a whole-vector move.  Other patterns fall through to
     the general decode.  */
  if (    INSTR (15, 10) == 0x07
      && (INSTR (9, 5) ==
	  INSTR (20, 16)))
    {
      if (INSTR (31, 21) == 0x075
	  || INSTR (31, 21) == 0x275)
	{
	  do_vec_MOV_whole_vector (cpu);
	  return;
	}
    }

  /* Immediate forms, identified by instr[29,19] alone.  */
  if (INSTR (29, 19) == 0x1E0)
    {
      do_vec_MOV_immediate (cpu);
      return;
    }

  if (INSTR (29, 19) == 0x5E0)
    {
      do_vec_MVNI (cpu);
      return;
    }

  /* Duplicate a scalar into every lane; only sub-opcode 0x03 is taken
     here, the rest falls through.  */
  if (INSTR (29, 19) == 0x1C0
      || INSTR (29, 19) == 0x1C1)
    {
      if (INSTR (15, 10) == 0x03)
	{
	  do_vec_DUP_scalar_into_vector (cpu);
	  return;
	}
    }

  switch (INSTR (29, 24))
    {
    case 0x0E: do_vec_op1 (cpu); return;
    case 0x0F: do_vec_op2 (cpu); return;

    case 0x2E:
      /* With instr[21] set, the sub-opcode in instr[15,10] selects the
	 handler.  Sub-opcodes not listed break out of this inner switch
	 and are retried by the tests that follow it.  */
      if (INSTR (21, 21) == 1)
	{
	  switch (INSTR (15, 10))
	    {
	    case 0x02:
	      do_vec_REV32 (cpu);
	      return;

	    case 0x07:
	      /* instr[23,22] picks one of the bitwise operations.  */
	      switch (INSTR (23, 22))
		{
		case 0: do_vec_EOR (cpu); return;
		case 1: do_vec_BSL (cpu); return;
		case 2:
		case 3: do_vec_bit (cpu); return;
		}
	      break;

	    case 0x08: do_vec_sub_long (cpu); return;
	    case 0x11: do_vec_USHL (cpu); return;
	    case 0x12: do_vec_CLZ (cpu); return;
	    case 0x16: do_vec_NOT (cpu); return;
	    case 0x19: do_vec_max (cpu); return;
	    case 0x1B: do_vec_min (cpu); return;
	    case 0x21: do_vec_SUB (cpu); return;
	    case 0x25: do_vec_MLS (cpu); return;
	    case 0x31: do_vec_FminmaxNMP (cpu); return;
	    case 0x35: do_vec_FADDP (cpu); return;
	    case 0x37: do_vec_FMUL (cpu); return;
	    case 0x3F: do_vec_FDIV (cpu); return;

	    case 0x3E:
	      /* NOTE(review): there is no break after this inner switch,
		 so its default case relies on HALT_NYI not returning;
		 otherwise control would fall into do_vec_compare below.
		 Confirm.  */
	      switch (INSTR (20, 16))
		{
		case 0x00: do_vec_FNEG (cpu); return;
		case 0x01: do_vec_FSQRT (cpu); return;
		default:   HALT_NYI;
		}

	    case 0x0D:
	    case 0x0F:
	    case 0x22:
	    case 0x23:
	    case 0x26:
	    case 0x2A:
	    case 0x32:
	    case 0x36:
	    case 0x39:
	    case 0x3A:
	      do_vec_compare (cpu); return;

	    default:
	      break;
	    }
	}

      /* instr[31,21] == 0x370: instr[10] separates the element move
	 from the byte extract.  */
      if (INSTR (31, 21) == 0x370)
	{
	  if (INSTR (10, 10))
	    do_vec_MOV_element (cpu);
	  else
	    do_vec_EXT (cpu);
	  return;
	}

      /* Last resort for this group: match instr[21,10] exactly, then
	 sub-opcode 0x30.  */
      switch (INSTR (21, 10))
	{
	case 0x82E: do_vec_neg (cpu); return;
	case 0x87E: do_vec_sqrt (cpu); return;
	default:
	  if (INSTR (15, 10) == 0x30)
	    {
	      do_vec_mull (cpu);
	      return;
	    }
	  break;
	}
      break;

    case 0x2f:
      /* NOTE(review): this case has no break either; its default relies
	 on HALT_NYI not returning.  Confirm.  */
      switch (INSTR (15, 10))
	{
	case 0x01: do_vec_SSHR_USHR (cpu); return;
	case 0x10:
	case 0x12: do_vec_mls_indexed (cpu); return;
	case 0x29: do_vec_xtl (cpu); return;
	default:
	  HALT_NYI;
	}

    default:
      break;
    }

  /* Nothing above recognised the instruction.  */
  HALT_NYI;
}

/* 3 sources.  */

/* Single-precision multiply-add: Sd = Sa + Sn * Sm.  */
static void
fmadds (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  float addend;
  float lhs;
  float rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_FP_float (cpu, ra);
  lhs = aarch64_get_FP_float (cpu, rn);
  rhs = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, addend + lhs * rhs);
}

/* Double-precision multiply-add: Dd = Da + Dn * Dm.  */
static void
fmaddd (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  double addend;
  double lhs;
  double rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_FP_double (cpu, ra);
  lhs = aarch64_get_FP_double (cpu, rn);
  rhs = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, addend + lhs * rhs);
}

/* Single-precision multiply-subtract: Sd = Sa - Sn * Sm.  */
static void
fmsubs (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  float minuend;
  float lhs;
  float rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  minuend = aarch64_get_FP_float (cpu, ra);
  lhs = aarch64_get_FP_float (cpu, rn);
  rhs = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, minuend - lhs * rhs);
}

/* Double-precision multiply-subtract: Dd = Da - Dn * Dm.  */
static void
fmsubd (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  double minuend;
  double lhs;
  double rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  minuend = aarch64_get_FP_double (cpu, ra);
  lhs = aarch64_get_FP_double (cpu, rn);
  rhs = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, minuend - lhs * rhs);
}

/* Single-precision negated multiply-add: Sd = - Sa + (- Sn) * Sm.  */
static void
fnmadds (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  float addend;
  float lhs;
  float rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_FP_float (cpu, ra);
  lhs = aarch64_get_FP_float (cpu, rn);
  rhs = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, - addend + (- lhs) * rhs);
}

/* Double-precision negated multiply-add: Dd = - Da + (- Dn) * Dm.  */
static void
fnmaddd (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  double addend;
  double lhs;
  double rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_FP_double (cpu, ra);
  lhs = aarch64_get_FP_double (cpu, rn);
  rhs = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, - addend + (- lhs) * rhs);
}

/* Single-precision negated multiply-subtract: Sd = - Sa + Sn * Sm.  */
static void
fnmsubs (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  float addend;
  float lhs;
  float rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_FP_float (cpu, ra);
  lhs = aarch64_get_FP_float (cpu, rn);
  rhs = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, - addend + lhs * rhs);
}

/* Double-precision negated multiply-subtract: Dd = - Da + Dn * Dm.  */
static void
fnmsubd (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned ra = INSTR (14, 10);
  const unsigned rm = INSTR (20, 16);
  double addend;
  double lhs;
  double rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_FP_double (cpu, ra);
  lhs = aarch64_get_FP_double (cpu, rn);
  rhs = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, - addend + lhs * rhs);
}

/* Decode a three-source floating-point data-processing instruction and
   call the matching fused-style helper.  The handler is selected by the
   four-bit value type:o1:o2; a set M or S bit, or a type above 1, halts
   as unallocated.  */
static void
dexSimpleFPDataProc3Source (sim_cpu *cpu)
{
  /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
     instr[30]    = 0
     instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
     instr[28,25] = 1111
     instr[24]    = 1
     instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
     instr[21]    ==> o1 : 0 ==> unnegated, 1 ==> negated
     instr[15]    ==> o2 : 0 ==> ADD, 1 ==> SUB  */

  /* Indexed by type:o1:o2 for type 0 (single) and type 1 (double).  */
  static void (* const handlers[8]) (sim_cpu *) =
    {
      fmadds, fmsubs, fnmadds, fnmsubs,
      fmaddd, fmsubd, fnmaddd, fnmsubd
    };

  const uint32_t m_and_s = (INSTR (31, 31) << 1) | INSTR (29, 29);
  const uint32_t selector = (INSTR (23, 21) << 1) | INSTR (15, 15);

  if (m_and_s != 0)
    HALT_UNALLOC;

  if (selector < 8)
    handlers[selector] (cpu);
  else
    /* type > 1 is currently unallocated.  */
    HALT_UNALLOC;
}

/* Placeholder decoder: nothing is emulated here yet, so every call halts
   via HALT_NYI.  Judging by the name this is meant to cover the
   floating-point <-> fixed-point conversions -- confirm against the
   caller.  */
static void
dexSimpleFPFixedConvert (sim_cpu *cpu)
{
  HALT_NYI;
}

/* Floating-point conditional compare.  When the condition in
   instr[15,12] does not hold, the flags are loaded from the immediate in
   instr[3,0].  Otherwise element 0 of Rn is compared with element 0 of
   Rm, in single or double precision, and the flags record the result:
   equal sets Z and C, less-than sets N, anything else sets C.  */
static void
dexSimpleFPCondCompare (sim_cpu *cpu)
{
  /* instr [31,23] = 0001 1110 0
     instr [22]    = type
     instr [21]    = 1
     instr [20,16] = Rm
     instr [15,12] = condition
     instr [11,10] = 01
     instr [9,5]   = Rn
     instr [4]     = 0
     instr [3,0]   = nzcv  */

  const unsigned rhs_reg = INSTR (20, 16);
  const unsigned lhs_reg = INSTR (9, 5);
  uint32_t flags;

  NYI_assert (31, 23, 0x3C);
  NYI_assert (11, 10, 0x1);
  NYI_assert (4,  4,  0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (! testConditionCode (cpu, INSTR (15, 12)))
    {
      aarch64_set_CPSR (cpu, INSTR (3, 0));
      return;
    }

  /* FIXME: Check for NaNs.  As written, an unordered comparison ends up
     in the final "greater than" arm of either branch.  */
  if (INSTR (22, 22))
    {
      /* Double precision.  */
      const double lhs = aarch64_get_vec_double (cpu, lhs_reg, 0);
      const double rhs = aarch64_get_vec_double (cpu, rhs_reg, 0);

      if (lhs == rhs)
	flags = (Z | C);
      else if (lhs < rhs)
	flags = N;
      else
	flags = C;
    }
  else
    {
      /* Single precision.  */
      const float lhs = aarch64_get_vec_float (cpu, lhs_reg, 0);
      const float rhs = aarch64_get_vec_float (cpu, rhs_reg, 0);

      if (lhs == rhs)
	flags = (Z | C);
      else if (lhs < rhs)
	flags = N;
      else
	flags = C;
    }

  aarch64_set_CPSR (cpu, flags);
}

/* 2 sources.  */

/* Single-precision add: Sd = Sn + Sm.  */
static void
fadds (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  float lhs;
  float rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_FP_float (cpu, rn);
  rhs = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, lhs + rhs);
}

/* Double-precision add: Dd = Dn + Dm.  */
static void
faddd (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  double lhs;
  double rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_FP_double (cpu, rn);
  rhs = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, lhs + rhs);
}

/* Single-precision divide: Sd = Sn / Sm.  */
static void
fdivs (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  float dividend;
  float divisor;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  dividend = aarch64_get_FP_float (cpu, rn);
  divisor = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, dividend / divisor);
}

/* Double-precision divide: Dd = Dn / Dm.  */
static void
fdivd (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  double dividend;
  double divisor;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  dividend = aarch64_get_FP_double (cpu, rn);
  divisor = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, dividend / divisor);
}

/* Single-precision multiply: Sd = Sn * Sm.  */
static void
fmuls (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  float lhs;
  float rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_FP_float (cpu, rn);
  rhs = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, lhs * rhs);
}

/* Double-precision multiply: Dd = Dn * Dm.  */
static void
fmuld (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  double lhs;
  double rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_FP_double (cpu, rn);
  rhs = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, lhs * rhs);
}

/* Single-precision negated multiply: Sd = - (Sn * Sm).  */
static void
fnmuls (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  float lhs;
  float rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_FP_float (cpu, rn);
  rhs = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, - (lhs * rhs));
}

/* Double-precision negated multiply: Dd = - (Dn * Dm).  */
static void
fnmuld (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  double lhs;
  double rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_FP_double (cpu, rn);
  rhs = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, - (lhs * rhs));
}

/* Single-precision subtract: Sd = Sn - Sm.  */
static void
fsubs (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  float minuend;
  float subtrahend;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  minuend = aarch64_get_FP_float (cpu, rn);
  subtrahend = aarch64_get_FP_float (cpu, rm);
  aarch64_set_FP_float (cpu, rd, minuend - subtrahend);
}

/* Double-precision subtract: Dd = Dn - Dm.  */
static void
fsubd (sim_cpu *cpu)
{
  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);
  double minuend;
  double subtrahend;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  minuend = aarch64_get_FP_double (cpu, rn);
  subtrahend = aarch64_get_FP_double (cpu, rm);
  aarch64_set_FP_double (cpu, rd, minuend - subtrahend);
}

/* Scalar floating-point minimum number: Sd receives the result of
   fminnm (single) or dminnm (double) applied to Sn and Sm.  */
static void
do_FMINNM (sim_cpu *cpu)
{
  /* instr[31,23] = 0 0011 1100
     instr[22]    = float(0)/double(1)
     instr[21]    = 1
     instr[20,16] = Sm
     instr[15,10] = 01 1110
     instr[9,5]   = Sn
     instr[4,0]   = Sd  */

  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);

  NYI_assert (31, 23, 0x03C);
  NYI_assert (15, 10, 0x1E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (! INSTR (22, 22))
    {
      aarch64_set_FP_float (cpu, rd,
			    fminnm (aarch64_get_FP_float (cpu, rn),
				    aarch64_get_FP_float (cpu, rm)));
      return;
    }

  aarch64_set_FP_double (cpu, rd,
			 dminnm (aarch64_get_FP_double (cpu, rn),
				 aarch64_get_FP_double (cpu, rm)));
}

/* Scalar floating-point maximum number: Sd receives the result of
   fmaxnm (single) or dmaxnm (double) applied to Sn and Sm.  */
static void
do_FMAXNM (sim_cpu *cpu)
{
  /* instr[31,23] = 0 0011 1100
     instr[22]    = float(0)/double(1)
     instr[21]    = 1
     instr[20,16] = Sm
     instr[15,10] = 01 1010
     instr[9,5]   = Sn
     instr[4,0]   = Sd  */

  const unsigned rd = INSTR ( 4,  0);
  const unsigned rn = INSTR ( 9,  5);
  const unsigned rm = INSTR (20, 16);

  NYI_assert (31, 23, 0x03C);
  NYI_assert (15, 10, 0x1A);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  if (! INSTR (22, 22))
    {
      aarch64_set_FP_float (cpu, rd,
			    fmaxnm (aarch64_get_FP_float (cpu, rn),
				    aarch64_get_FP_float (cpu, rm)));
      return;
    }

  aarch64_set_FP_double (cpu, rd,
			 dmaxnm (aarch64_get_FP_double (cpu, rn),
				 aarch64_get_FP_double (cpu, rm)));
}

/* Decode the scalar FP "data processing, two source" group and dispatch
   to the single or double precision emulation routine.  */
static void
dexSimpleFPDataProc2Source (sim_cpu *cpu)
{
  /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
     instr[30]    = 0
     instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
     instr[28,25] = 1111
     instr[24]    = 0
     instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
     instr[21]    = 1
     instr[20,16] = Vm
     instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
                               0010 ==> FADD, 0011 ==> FSUB,
                               0100 ==> FMAX, 0101 ==> FMIN
                               0110 ==> FMAXNM, 0111 ==> FMINNM
                               1000 ==> FNMUL, ow ==> UNALLOC
     instr[11,10] = 10
     instr[9,5]   = Vn
     instr[4,0]   = Vd  */

  uint32_t m_and_s = (INSTR (31, 31) << 1) | INSTR (29, 29);
  uint32_t is_double = INSTR (23, 22);
  uint32_t opcode = INSTR (15, 12);

  if (is_double > 1)
    HALT_UNALLOC;

  if (m_and_s != 0)
    HALT_UNALLOC;

  /* One switch on the opcode; the precision picks the routine.
     FMAXNM and FMINNM select the precision themselves.  */
  switch (opcode)
    {
    case 0:
      if (is_double)
	fmuld (cpu);
      else
	fmuls (cpu);
      return;

    case 1:
      if (is_double)
	fdivd (cpu);
      else
	fdivs (cpu);
      return;

    case 2:
      if (is_double)
	faddd (cpu);
      else
	fadds (cpu);
      return;

    case 3:
      if (is_double)
	fsubd (cpu);
      else
	fsubs (cpu);
      return;

    case 6:
      do_FMAXNM (cpu);
      return;

    case 7:
      do_FMINNM (cpu);
      return;

    case 8:
      if (is_double)
	fnmuld (cpu);
      else
	fnmuls (cpu);
      return;

      /* Have not yet implemented fmax and fmin.  */
    case 4:
    case 5:
      HALT_NYI;

    default:
      HALT_UNALLOC;
    }
}

/* FCSEL: copy Sn to the destination when the condition in instr[15,12]
   holds, otherwise copy Sm.  */
static void
dexSimpleFPCondSelect (sim_cpu *cpu)
{
  /* FCSEL
     instr[31,23] = 0 0011 1100
     instr[22]    = 0=>single 1=>double
     instr[21]    = 1
     instr[20,16] = Sm
     instr[15,12] = cond
     instr[11,10] = 11
     instr[9,5]   = Sn
     instr[4,0]   = Sd  */
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR ( 9, 5);
  unsigned rd = INSTR ( 4, 0);
  uint32_t cond_holds = testConditionCode (cpu, INSTR (15, 12));
  /* Register whose contents get copied to the destination.  */
  unsigned chosen;

  NYI_assert (31, 23, 0x03C);
  NYI_assert (11, 10, 0x3);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  chosen = cond_holds ? rn : rm;

  if (INSTR (22, 22))
    aarch64_set_FP_double (cpu, rd, aarch64_get_FP_double (cpu, chosen));
  else
    aarch64_set_FP_float (cpu, rd, aarch64_get_FP_float (cpu, chosen));
}

/* Store 32 bit unscaled signed 9 bit: element 0 (32 bits) of the vector
   register in instr[4,0] goes to memory at the base register plus OFFSET.  */
static void
fsturs (sim_cpu *cpu, int32_t offset)
{
  unsigned int base = INSTR (9, 5);
  unsigned int src = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base, SP_OK) + offset;
  aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, src, 0));
}

/* Store 64 bit unscaled signed 9 bit: element 0 (64 bits) of the vector
   register in instr[4,0] goes to memory at the base register plus OFFSET.  */
static void
fsturd (sim_cpu *cpu, int32_t offset)
{
  unsigned int base = INSTR (9, 5);
  unsigned int src = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base, SP_OK) + offset;
  aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, src, 0));
}

/* Store 128 bit unscaled signed 9 bit: the whole FP register in
   instr[4,0] goes to memory at the base register plus OFFSET.  */
static void
fsturq (sim_cpu *cpu, int32_t offset)
{
  unsigned int base = INSTR (9, 5);
  unsigned int src = INSTR (4, 0);
  FRegister contents;
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_FP_long_double (cpu, src, & contents);
  address = aarch64_get_reg_u64 (cpu, base, SP_OK) + offset;
  aarch64_set_mem_long_double (cpu, address, contents);
}

/* TODO FP move register.  */

/* 32 bit fp to fp move register: copy the float in register instr[9,5]
   to register instr[4,0].  */
static void
ffmovs (sim_cpu *cpu)
{
  unsigned int src = INSTR (9, 5);
  unsigned int dst = INSTR (4, 0);
  float contents;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  contents = aarch64_get_FP_float (cpu, src);
  aarch64_set_FP_float (cpu, dst, contents);
}

/* 64 bit fp to fp move register: copy the double in register instr[9,5]
   to register instr[4,0].  */
static void
ffmovd (sim_cpu *cpu)
{
  unsigned int src = INSTR (9, 5);
  unsigned int dst = INSTR (4, 0);
  double contents;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  contents = aarch64_get_FP_double (cpu, src);
  aarch64_set_FP_double (cpu, dst, contents);
}

/* 32 bit GReg to Vec move register: the 32 bit general register
   instr[9,5] (read with NO_SP) is written to element 0 of vector
   register instr[4,0].  */
static void
fgmovs (sim_cpu *cpu)
{
  unsigned int gr = INSTR (9, 5);
  unsigned int vr = INSTR (4, 0);
  uint32_t bits;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  bits = aarch64_get_reg_u32 (cpu, gr, NO_SP);
  aarch64_set_vec_u32 (cpu, vr, 0, bits);
}

/* 64 bit g to fp move register: the 64 bit general register instr[9,5]
   (read with NO_SP) is written to element 0 of vector register
   instr[4,0].  */
static void
fgmovd (sim_cpu *cpu)
{
  unsigned int gr = INSTR (9, 5);
  unsigned int vr = INSTR (4, 0);
  uint64_t bits;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  bits = aarch64_get_reg_u64 (cpu, gr, NO_SP);
  aarch64_set_vec_u64 (cpu, vr, 0, bits);
}

/* 32 bit fp to g move register: element 0 (32 bits) of vector register
   instr[9,5] is written, zero extended, to the 64 bit general register
   instr[4,0].  */
static void
gfmovs (sim_cpu *cpu)
{
  unsigned int vr = INSTR (9, 5);
  unsigned int gr = INSTR (4, 0);
  uint32_t bits;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  bits = aarch64_get_vec_u32 (cpu, vr, 0);
  aarch64_set_reg_u64 (cpu, gr, NO_SP, bits);
}

/* 64 bit fp to g move register: element 0 (64 bits) of vector register
   instr[9,5] is written to the general register instr[4,0].  */
static void
gfmovd (sim_cpu *cpu)
{
  unsigned int vr = INSTR (9, 5);
  unsigned int gr = INSTR (4, 0);
  uint64_t bits;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  bits = aarch64_get_vec_u64 (cpu, vr, 0);
  aarch64_set_reg_u64 (cpu, gr, NO_SP, bits);
}

/* FP move immediate

   These install an immediate 8 bit value in the target register
   where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
   bit exponent.  */

/* FMOV (immediate), single precision: expand the 8 bit encoding in
   instr[20,13] with fp_immediate_for_encoding_32 and install it in
   register instr[4,0].  */
static void
fmovs (sim_cpu *cpu)
{
  unsigned int rd = INSTR (4, 0);
  uint32_t encoding = INSTR (20, 13);
  float expanded = fp_immediate_for_encoding_32 (encoding);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_FP_float (cpu, rd, expanded);
}

/* FMOV (immediate), double precision: expand the 8 bit encoding in
   instr[20,13] with fp_immediate_for_encoding_64 and install it in
   register instr[4,0].  */
static void
fmovd (sim_cpu *cpu)
{
  unsigned int rd = INSTR (4, 0);
  uint32_t encoding = INSTR (20, 13);
  double expanded = fp_immediate_for_encoding_64 (encoding);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_FP_double (cpu, rd, expanded);
}

/* Decode FMOV (immediate) and hand over to the single or double
   precision routine.  */
static void
dexSimpleFPImmediate (sim_cpu *cpu)
{
  /* instr[31,23] == 0 0011 1100
     instr[22]    == type : single(0)/double(1)
     instr[21]    == 1
     instr[20,13] == imm8
     instr[12,10] == 100
     instr[9,5]   == imm5 : 00000 ==> OK, ow ==> UNALLOC
     instr[4,0]   == Rd  */
  uint32_t must_be_zero = INSTR (9, 5);

  NYI_assert (31, 23, 0x3C);

  if (must_be_zero != 0)
    HALT_UNALLOC;

  if (! INSTR (22, 22))
    fmovs (cpu);
  else
    fmovd (cpu);
}

/* TODO specific decode and execute for group Load Store.  */

/* TODO FP load/store single register (unscaled offset).  */

/* TODO load 8 bit unscaled signed 9 bit.  */
/* TODO load 16 bit unscaled signed 9 bit.  */

/* Load 32 bit unscaled signed 9 bit: the word at the base register
   plus OFFSET is written to element 0 of vector register instr[4,0].  */
static void
fldurs (sim_cpu *cpu, int32_t offset)
{
  unsigned int base = INSTR (9, 5);
  unsigned int dst = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base, SP_OK) + offset;
  aarch64_set_vec_u32 (cpu, dst, 0, aarch64_get_mem_u32 (cpu, address));
}

/* Load 64 bit unscaled signed 9 bit: the doubleword at the base register
   plus OFFSET is written to element 0 of vector register instr[4,0].  */
static void
fldurd (sim_cpu *cpu, int32_t offset)
{
  unsigned int base = INSTR (9, 5);
  unsigned int dst = INSTR (4, 0);
  uint64_t address;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, base, SP_OK) + offset;
  aarch64_set_vec_u64 (cpu, dst, 0, aarch64_get_mem_u64 (cpu, address));
}

/* Load 128 bit unscaled signed 9 bit: the quadword at the base register
   plus OFFSET is loaded into FP register instr[4,0].  */
static void
fldurq (sim_cpu *cpu, int32_t offset)
{
  unsigned int base = INSTR (9, 5);
  unsigned int dst = INSTR (4, 0);
  FRegister loaded;
  uint64_t address = aarch64_get_reg_u64 (cpu, base, SP_OK) + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, address, & loaded);
  aarch64_set_FP_long_double (cpu, dst, loaded);
}

/* TODO store 8 bit unscaled signed 9 bit.  */
/* TODO store 16 bit unscaled signed 9 bit.  */


/* 1 source.  */

/* Float absolute value: register[4,0] = fabsf (register[9,5]).  */
static void
fabss (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  float operand = aarch64_get_FP_float (cpu, rn);
  float magnitude;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  magnitude = fabsf (operand);
  aarch64_set_FP_float (cpu, rd, magnitude);
}

/* Double absolute value: register[4,0] = fabs (register[9,5]).  */
static void
fabcpu (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  double operand = aarch64_get_FP_double (cpu, rn);
  double magnitude;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  magnitude = fabs (operand);
  aarch64_set_FP_double (cpu, rd, magnitude);
}

/* Float negative value: register[4,0] = - register[9,5].  */
static void
fnegs (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  float operand;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  operand = aarch64_get_FP_float (cpu, rn);
  aarch64_set_FP_float (cpu, rd, - operand);
}

/* Double negative value: register[4,0] = - register[9,5].  */
static void
fnegd (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  double operand;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  operand = aarch64_get_FP_double (cpu, rn);
  aarch64_set_FP_double (cpu, rd, - operand);
}

/* Float square root: register[4,0] = sqrtf (register[9,5]).  */
static void
fsqrts (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  float operand;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  operand = aarch64_get_FP_float (cpu, rn);
  aarch64_set_FP_float (cpu, rd, sqrtf (operand));
}

/* Double square root: register[4,0] = sqrt (register[9,5]).  */
static void
fsqrtd (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  double operand;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  operand = aarch64_get_FP_double (cpu, rn);
  aarch64_set_FP_double (cpu, rd, sqrt (operand));
}

/* Convert double to float: the double in register instr[9,5] is
   narrowed with a C cast and stored in register instr[4,0].  */
static void
fcvtds (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  double wide;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  wide = aarch64_get_FP_double (cpu, rn);
  aarch64_set_FP_float (cpu, rd, (float) wide);
}

/* Convert float to double: the float in register instr[9,5] is widened
   and stored in register instr[4,0].  */
static void
fcvtcpu (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  float narrow;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  narrow = aarch64_get_FP_float (cpu, rn);
  aarch64_set_FP_double (cpu, rd, (double) narrow);
}

/* Round VAL to the nearest integral value, with exact ties going to the
   even neighbour.  The sign of VAL is kept on a zero result.  */
static double
frint_ties_even_double (double val)
{
  /* round () gives nearest with ties away from zero.  */
  double rval = round (val);

  /* On an exact tie that landed on an odd value, step back one towards
     zero to reach the even neighbour.  */
  if (fabs (val - rval) == 0.5 && fmod (rval, 2.0) != 0.0)
    rval -= copysign (1.0, val);

  return copysign (rval, val);
}

/* Single precision counterpart of frint_ties_even_double.  */
static float
frint_ties_even_float (float val)
{
  float rval = roundf (val);

  if (fabsf (val - rval) == 0.5f && fmodf (rval, 2.0f) != 0.0f)
    rval -= copysignf (1.0f, val);

  return copysignf (rval, val);
}

/* FRINTN/P/M/Z/A/X/I: round the FP source register to an integral value
   in the same FP format, using the rounding mode from instr[17,15].  */
static void
do_FRINT (sim_cpu *cpu)
{
  /* instr[31,23] = 0001 1110 0
     instr[22]    = single(0)/double(1)
     instr[21,18] = 1001
     instr[17,15] = rounding mode
     instr[14,10] = 10000
     instr[9,5]   = source
     instr[4,0]   = dest  */

  unsigned rs = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned int rmode = INSTR (17, 15);

  NYI_assert (31, 23, 0x03C);
  NYI_assert (21, 18, 0x9);
  NYI_assert (14, 10, 0x10);

  if (rmode == 6 || rmode == 7)
    /* FIXME: Add support for rmode == 6 exactness check.
       NOTE(review): the architecture keeps RMode in FPCR[23,22]; this
       reads the same bits of the FPSR value - confirm which register
       is intended.  */
    rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      double val = aarch64_get_FP_double (cpu, rs);

      switch (rmode)
	{
	case 0: /* mode N: nearest or even.  */
	  aarch64_set_FP_double (cpu, rd, frint_ties_even_double (val));
	  return;

	case 1: /* mode P: towards +inf.  */
	  aarch64_set_FP_double (cpu, rd, ceil (val));
	  return;

	case 2: /* mode M: towards -inf.  */
	  aarch64_set_FP_double (cpu, rd, floor (val));
	  return;

	case 3: /* mode Z: towards 0.  */
	  aarch64_set_FP_double (cpu, rd, trunc (val));
	  return;

	case 4: /* mode A: nearest, ties away from 0.  */
	  aarch64_set_FP_double (cpu, rd, round (val));
	  return;

	case 6: /* mode X: use FPCR with exactness check.  */
	case 7: /* mode I: use FPCR mode.  */
	  HALT_NYI;
	  return;

	default:
	  HALT_UNALLOC;
	  return;
	}
    }
  else
    {
      float val = aarch64_get_FP_float (cpu, rs);

      switch (rmode)
	{
	case 0: /* mode N: nearest or even.  */
	  aarch64_set_FP_float (cpu, rd, frint_ties_even_float (val));
	  return;

	case 1: /* mode P: towards +inf.  */
	  aarch64_set_FP_float (cpu, rd, ceilf (val));
	  return;

	case 2: /* mode M: towards -inf.  */
	  aarch64_set_FP_float (cpu, rd, floorf (val));
	  return;

	case 3: /* mode Z: towards 0.  */
	  aarch64_set_FP_float (cpu, rd, truncf (val));
	  return;

	case 4: /* mode A: nearest, ties away from 0.  */
	  aarch64_set_FP_float (cpu, rd, roundf (val));
	  return;

	case 6: /* mode X: use FPCR with exactness check.  */
	case 7: /* mode I: use FPCR mode.  */
	  HALT_NYI;
	  return;

	default:
	  HALT_UNALLOC;
	  return;
	}
    }
}

/* Convert half to float: the half precision value in register instr[9,5]
   is widened and stored in register instr[4,0].  */
static void
do_FCVT_half_to_single (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  float widened;

  NYI_assert (31, 10, 0x7B890);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  widened = (float) aarch64_get_FP_half (cpu, src);
  aarch64_set_FP_float (cpu, dst, widened);
}

/* Convert half to double: the half precision value in register
   instr[9,5] is widened and stored in register instr[4,0].  */
static void
do_FCVT_half_to_double (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  double widened;

  NYI_assert (31, 10, 0x7B8B0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  widened = (double) aarch64_get_FP_half (cpu, src);
  aarch64_set_FP_double (cpu, dst, widened);
}

/* Convert float to half: the float in register instr[9,5] is stored as
   a half precision value in register instr[4,0].  */
static void
do_FCVT_single_to_half (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  float operand;

  NYI_assert (31, 10, 0x788F0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  operand = aarch64_get_FP_float (cpu, src);
  aarch64_set_FP_half (cpu, dst, operand);
}

/* Convert double to half: the double in register instr[9,5] is first
   narrowed to float with a C cast and then stored as a half precision
   value in register instr[4,0].  */
static void
do_FCVT_double_to_half (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  float narrowed;

  NYI_assert (31, 10, 0x798F0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  narrowed = (float) aarch64_get_FP_double (cpu, src);
  aarch64_set_FP_half (cpu, dst, narrowed);
}

/* Decode the scalar FP "data processing, one source" group and dispatch
   to the matching emulation routine.  */
static void
dexSimpleFPDataProc1Source (sim_cpu *cpu)
{
  /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
     instr[30]    = 0
     instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
     instr[28,25] = 1111
     instr[24]    = 0
     instr[23,22] ==> type : 00 ==> source is single,
                             01 ==> source is double
                             10 ==> UNALLOC
                             11 ==> UNALLOC or source is half
     instr[21]    = 1
     instr[20,15] ==> opcode : with type 00 or 01
                               000000 ==> FMOV, 000001 ==> FABS,
                               000010 ==> FNEG, 000011 ==> FSQRT,
                               000100 ==> FCVT (double to single),
                               000101 ==> FCVT (single to double),
                               000110 ==> UNALLOC, 000111 ==> FCVT (to half)
                               001000 ==> FRINTN, 001001 ==> FRINTP,
                               001010 ==> FRINTM, 001011 ==> FRINTZ,
                               001100 ==> FRINTA, 001101 ==> not yet implemented
                               001110 ==> FRINTX, 001111 ==> FRINTI
                               with type 11
                               000100 ==> FCVT (half-to-single)
                               000101 ==> FCVT (half-to-double)
     instr[14,10] = 10000.  */

  uint32_t m_and_s = (INSTR (31, 31) << 1) | INSTR (29, 29);
  uint32_t src_type = INSTR (23, 22);
  uint32_t op = INSTR (20, 15);

  if (m_and_s != 0)
    HALT_UNALLOC;

  /* Half precision sources only support the two widening conversions.  */
  if (src_type == 3)
    {
      switch (op)
	{
	case 4:
	  do_FCVT_half_to_single (cpu);
	  break;

	case 5:
	  do_FCVT_half_to_double (cpu);
	  break;

	default:
	  HALT_UNALLOC;
	}
      return;
    }

  if (src_type == 2)
    HALT_UNALLOC;

  /* From here on src_type is 0 (single) or 1 (double).  */
  switch (op)
    {
    case 0:
      if (! src_type)
	ffmovs (cpu);
      else
	ffmovd (cpu);
      return;

    case 1:
      if (! src_type)
	fabss (cpu);
      else
	fabcpu (cpu);
      return;

    case 2:
      if (! src_type)
	fnegs (cpu);
      else
	fnegd (cpu);
      return;

    case 3:
      if (! src_type)
	fsqrts (cpu);
      else
	fsqrtd (cpu);
      return;

    case 4:
      /* Conversion to single: only a double source makes sense.  */
      if (! src_type)
	HALT_UNALLOC;
      else
	fcvtds (cpu);
      return;

    case 5:
      /* Conversion to double: only a single source makes sense.  */
      if (src_type)
	HALT_UNALLOC;
      fcvtcpu (cpu);
      return;

    case 7:
      if (INSTR (22, 22))
	do_FCVT_double_to_half (cpu);
      else
	do_FCVT_single_to_half (cpu);
      return;

    case 8:		/* FRINTN etc.  */
    case 9:
    case 10:
    case 11:
    case 12:
    case 14:
    case 15:
      do_FRINT (cpu);
      return;

    case 13:
      HALT_NYI;

    default:
      HALT_UNALLOC;
    }
}

/* 32 bit signed int to float: the signed 32 bit general register
   instr[9,5] is converted and stored in FP register instr[4,0].  */
static void
scvtf32 (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  float converted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  converted = (float) aarch64_get_reg_s32 (cpu, rn, NO_SP);
  aarch64_set_FP_float (cpu, rd, converted);
}

/* 64 bit signed int to float: the signed 64 bit general register
   instr[9,5] is converted and stored in FP register instr[4,0].  */
static void
scvtf (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  float converted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  converted = (float) aarch64_get_reg_s64 (cpu, rn, NO_SP);
  aarch64_set_FP_float (cpu, rd, converted);
}

/* 32 bit signed int to double: the signed 32 bit general register
   instr[9,5] is converted and stored in FP register instr[4,0].  */
static void
scvtd32 (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  double converted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  converted = (double) aarch64_get_reg_s32 (cpu, rn, NO_SP);
  aarch64_set_FP_double (cpu, rd, converted);
}

/* 64 bit signed int to double: the signed 64 bit general register
   instr[9,5] is converted and stored in FP register instr[4,0].  */
static void
scvtd (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  double converted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  converted = (double) aarch64_get_reg_s64 (cpu, rn, NO_SP);
  aarch64_set_FP_double (cpu, rd, converted);
}

/* Integer range limits expressed in each FP format.  RAISE_EXCEPTIONS
   builds these names by token pasting (FTYPE_ITYPE_MAX / _MIN), so the
   spelling of every identifier below matters.  */

/* The host headers provide no minimum for the unsigned types.  */
#define UINT_MIN 0
#define ULONG_MIN 0

/* Limits as single precision values.  */
static const float  FLOAT_INT_MIN    = (float)  INT_MIN;
static const float  FLOAT_INT_MAX    = (float)  INT_MAX;
static const float  FLOAT_LONG_MIN   = (float)  LONG_MIN;
static const float  FLOAT_LONG_MAX   = (float)  LONG_MAX;
static const float  FLOAT_UINT_MIN   = (float)  UINT_MIN;
static const float  FLOAT_UINT_MAX   = (float)  UINT_MAX;
static const float  FLOAT_ULONG_MIN  = (float)  ULONG_MIN;
static const float  FLOAT_ULONG_MAX  = (float)  ULONG_MAX;

/* Limits as double precision values.  */
static const double DOUBLE_INT_MIN   = (double) INT_MIN;
static const double DOUBLE_INT_MAX   = (double) INT_MAX;
static const double DOUBLE_LONG_MIN  = (double) LONG_MIN;
static const double DOUBLE_LONG_MAX  = (double) LONG_MAX;
static const double DOUBLE_UINT_MIN  = (double) UINT_MIN;
static const double DOUBLE_UINT_MAX  = (double) UINT_MAX;
static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;

/* Check for FP exception conditions when the FP value F has been
   converted to the integer VALUE, and patch VALUE up where needed:
     NaN raises IO and yields zero
     Infinity raises IO and saturates towards the sign of F
     Out of Range raises IO and IX and saturates value
     Denormal raises ID and IX and sets to zero.
   FTYPE (FLOAT or DOUBLE) and ITYPE (INT, LONG, UINT or ULONG) are
   pasted together to pick the FTYPE_ITYPE_MAX/_MIN limits that F is
   compared against, and ITYPE_MAX/_MIN are the saturation values.
   F is expanded several times, so it must not have side effects.
   The macro uses a variable called `cpu' from the enclosing scope.
   NOTE(review): the NaN and Infinity cases call aarch64_set_FPSR while
   the others call aarch64_set_FPSR_bits - confirm that replacing the
   whole FPSR value is intended there.  */
#define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)	\
  do							\
    {							\
      switch (fpclassify (F))				\
	{						\
	case FP_NAN:					\
	  aarch64_set_FPSR (cpu, IO);			\
	  VALUE = 0;					\
	  break;					\
							\
	case FP_INFINITE:				\
	  aarch64_set_FPSR (cpu, IO);			\
	  if (signbit (F))				\
	    VALUE = ITYPE##_MIN;			\
	  else						\
	    VALUE = ITYPE##_MAX;			\
	  break;					\
							\
	case FP_NORMAL:					\
	  if (F >= FTYPE##_##ITYPE##_MAX)		\
	    {						\
	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
	      VALUE = ITYPE##_MAX;			\
	    }						\
	  else if (F <= FTYPE##_##ITYPE##_MIN)		\
	    {						\
	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);	\
	      VALUE = ITYPE##_MIN;			\
	    }						\
	  break;					\
							\
	case FP_SUBNORMAL:				\
	  aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);	\
	  VALUE = 0;					\
	  break;					\
							\
	default:					\
	case FP_ZERO:					\
	  VALUE = 0;					\
	  break;					\
	}						\
    }							\
  while (0)

/* 32 bit convert float to signed int truncate towards zero.  The C cast
   result is adjusted by RAISE_EXCEPTIONS and written, zero extended, to
   general register instr[4,0].  */
static void
fcvtszs32 (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  /* TODO : check that this rounds toward zero.  */
  float   src = aarch64_get_FP_float (cpu, rn);
  int32_t result = (int32_t) src;

  RAISE_EXCEPTIONS (src, result, FLOAT, INT);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Going through uint32_t keeps the upper 32 bits of the register
     clear instead of sign extending.  */
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) result);
}

/* 64 bit convert float to signed int truncate towards zero.  The C cast
   result is adjusted by RAISE_EXCEPTIONS and written to general
   register instr[4,0].  */
static void
fcvtszs (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  float   src = aarch64_get_FP_float (cpu, rn);
  int64_t result = (int64_t) src;

  RAISE_EXCEPTIONS (src, result, FLOAT, LONG);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
}

/* 32 bit convert double to signed int truncate towards zero.  The C cast
   result is adjusted by RAISE_EXCEPTIONS and written, zero extended, to
   general register instr[4,0].  */
static void
fcvtszd32 (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  /* TODO : check that this rounds toward zero.  */
  double  src = aarch64_get_FP_double (cpu, rn);
  int32_t result = (int32_t) src;

  RAISE_EXCEPTIONS (src, result, DOUBLE, INT);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Going through uint32_t keeps the upper 32 bits of the register
     clear instead of sign extending.  */
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) result);
}

/* 64 bit convert double to signed int truncate towards zero.  The C cast
   result is adjusted by RAISE_EXCEPTIONS and written to general
   register instr[4,0].  */
static void
fcvtszd (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  /* TODO : check that this rounds toward zero.  */
  double  src = aarch64_get_FP_double (cpu, rn);
  int64_t result = (int64_t) src;

  RAISE_EXCEPTIONS (src, result, DOUBLE, LONG);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
}

/* FCVTZU (scalar, integer): convert the FP source register to an
   unsigned 32 or 64 bit integer and write it to a general register.
   The fixed point form (instr[21] == 0) is not implemented.  */
static void
do_fcvtzu (sim_cpu *cpu)
{
  /* instr[31]    = size: 32-bit (0), 64-bit (1)
     instr[30,23] = 00111100
     instr[22]    = type: single (0)/ double (1)
     instr[21]    = enable (0)/disable(1) precision
     instr[20,16] = 11001
     instr[15,10] = precision
     instr[9,5]   = Rs
     instr[4,0]   = Rd.  */

  unsigned rs = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  NYI_assert (30, 23, 0x3C);
  NYI_assert (20, 16, 0x19);

  if (INSTR (21, 21) != 1)
    /* Convert to fixed point.  */
    HALT_NYI;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (31, 31))
    {
      /* Built from a 64 bit operand so that the shift is well defined
	 even on hosts where unsigned long is only 32 bits wide.  */
      const uint64_t top_bit = (uint64_t) 1 << 63;
      uint64_t value;

      /* Convert to unsigned 64-bit integer.  */
      if (INSTR (22, 22))
	{
	  double  d = aarch64_get_FP_double (cpu, rs);

	  value = (uint64_t) d;
	  /* Skip the exception check when the host conversion produced
	     exactly 2^63.  */
	  if (value != top_bit)
	    RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
	}
      else
	{
	  float  f = aarch64_get_FP_float (cpu, rs);

	  value = (uint64_t) f;
	  /* Skip the exception check when the host conversion produced
	     exactly 2^63.  */
	  if (value != top_bit)
	    RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
	}

      aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
    }
  else
    {
      const uint32_t top_bit = (uint32_t) 1 << 31;
      uint32_t value;

      /* Convert to unsigned 32-bit integer.  */
      if (INSTR (22, 22))
	{
	  double  d = aarch64_get_FP_double (cpu, rs);

	  value = (uint32_t) d;
	  /* Skip the exception check when the host conversion produced
	     exactly 2^31.  */
	  if (value != top_bit)
	    RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
	}
      else
	{
	  float  f = aarch64_get_FP_float (cpu, rs);

	  value = (uint32_t) f;
	  /* Skip the exception check when the host conversion produced
	     exactly 2^31.  */
	  if (value != top_bit)
	    RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
	}

      aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
    }
}

/* UCVTF (scalar, integer): convert an unsigned 32 or 64 bit general
   register to single or double precision.  The fixed point form
   (instr[21] == 0) is not implemented.  */
static void
do_UCVTF (sim_cpu *cpu)
{
  /* instr[31]    = size: 32-bit (0), 64-bit (1)
     instr[30,23] = 001 1110 0
     instr[22]    = type: single (0)/ double (1)
     instr[21]    = enable (0)/disable(1) precision
     instr[20,16] = 0 0011
     instr[15,10] = precision
     instr[9,5]   = Rs
     instr[4,0]   = Rd.  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (30, 23, 0x3C);
  NYI_assert (20, 16, 0x03);

  if (INSTR (21, 21) != 1)
    HALT_NYI;

  /* FIXME: Add exception raising.  */
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (! INSTR (31, 31))
    {
      /* 32 bit source register.  */
      uint32_t narrow =  aarch64_get_reg_u32 (cpu, src, NO_SP);

      if (! INSTR (22, 22))
	aarch64_set_FP_float (cpu, dst, (float) narrow);
      else
	aarch64_set_FP_double (cpu, dst, (double) narrow);
    }
  else
    {
      /* 64 bit source register.  */
      uint64_t wide = aarch64_get_reg_u64 (cpu, src, NO_SP);

      if (! INSTR (22, 22))
	aarch64_set_FP_float (cpu, dst, (float) wide);
      else
	aarch64_set_FP_double (cpu, dst, (double) wide);
    }
}

/* Move 64 bits between a general register and element 1 of a vector
   register, in the direction given by instr[16].  */
static void
float_vector_move (sim_cpu *cpu)
{
  /* instr[31,17] == 100 1111 0101 0111
     instr[16]    ==> direction 0=> to GR, 1=> from GR
     instr[15,10] => ???
     instr[9,5]   ==> source
     instr[4,0]   ==> dest.  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 17, 0x4F57);

  if (INSTR (15, 10) != 0)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (! INSTR (16, 16))
    {
      /* Vector element 1 to general register.  */
      aarch64_set_reg_u64 (cpu, dst, NO_SP, aarch64_get_vec_u64 (cpu, src, 1));
      return;
    }

  /* General register to vector element 1.  */
  aarch64_set_vec_u64 (cpu, dst, 1, aarch64_get_reg_u64 (cpu, src, NO_SP));
}

/* Decode the scalar "FP <-> integer conversion" group and dispatch to
   the matching emulation routine.  Encodings without an emulation
   routine end in HALT_NYI.  */
static void
dexSimpleFPIntegerConvert (sim_cpu *cpu)
{
  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30]    = 0
     instr[29]    = S :  0 ==> OK, 1 ==> UNALLOC
     instr[28,25] = 1111
     instr[24]    = 0
     instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
     instr[21]    = 1
     instr[20,19] = rmode
     instr[18,16] = opcode
     instr[15,10] = 10 0000  */

  uint32_t selector;	/* rmode and opcode combined: instr[20,16].  */
  uint32_t variant;	/* 0==32f, 1==32d, 2==64f, 3==64d.  */
  uint32_t fp_type;
  uint32_t is_64bit;
  uint32_t s_bit;

  /* The move to/from the upper half of a vector register shares this
     encoding space and is recognised first.  */
  if (INSTR (31, 17) == 0x4F57)
    {
      float_vector_move (cpu);
      return;
    }

  is_64bit = INSTR (31, 31);
  s_bit = INSTR (29, 29);
  if (s_bit != 0)
    HALT_UNALLOC;

  fp_type = INSTR (23, 22);
  if (fp_type > 1)
    HALT_UNALLOC;

  selector = INSTR (20, 16);
  variant = (is_64bit << 1) | fp_type;

  switch (selector)
    {
    case 2:			/* SCVTF.  */
      switch (variant)
	{
	case 0:
	  scvtf32 (cpu);
	  return;
	case 1:
	  scvtd32 (cpu);
	  return;
	case 2:
	  scvtf (cpu);
	  return;
	case 3:
	  scvtd (cpu);
	  return;
	}

    case 6:			/* FMOV GR, Vec.  */
      switch (variant)
	{
	case 0:
	  gfmovs (cpu);
	  return;
	case 3:
	  gfmovd (cpu);
	  return;
	default:
	  HALT_UNALLOC;
	}

    case 7:			/* FMOV vec, GR.  */
      switch (variant)
	{
	case 0:
	  fgmovs (cpu);
	  return;
	case 3:
	  fgmovd (cpu);
	  return;
	default:
	  HALT_UNALLOC;
	}

    case 24:			/* FCVTZS.  */
      switch (variant)
	{
	case 0:
	  fcvtszs32 (cpu);
	  return;
	case 1:
	  fcvtszd32 (cpu);
	  return;
	case 2:
	  fcvtszs (cpu);
	  return;
	case 3:
	  fcvtszd (cpu);
	  return;
	}

    case 25:			/* FCVTZU.  */
      do_fcvtzu (cpu);
      return;

    case 3:			/* UCVTF.  */
      do_UCVTF (cpu);
      return;

    case 0:	/* FCVTNS.  */
    case 1:	/* FCVTNU.  */
    case 4:	/* FCVTAS.  */
    case 5:	/* FCVTAU.  */
    case 8:	/* FCVTPS.  */
    case 9:	/* FCVTPU.  */
    case 16:	/* FCVTMS.  */
    case 17:	/* FCVTMU.  */
    default:
      HALT_NYI;
    }
}

/* Set the CPSR condition flags from a comparison of FVALUE1 against
   FVALUE2:
     either operand is a NaN  ==> C|V
     FVALUE1 == FVALUE2       ==> Z|C
     FVALUE1 <  FVALUE2       ==> N
     FVALUE1 >  FVALUE2       ==> C.  */
static void
set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
{
  uint32_t flags;

  /* FIXME: Add exception raising.  */
  if (isnan (fvalue1) || isnan (fvalue2))
    flags = C|V;
  /* With NaNs excluded the operands are ordered, so compare them
     directly.  This also copes with infinities: subtracting two
     infinities may give a NaN, and isinf is only specified to return a
     nonzero value, not the sign, so neither can be used to order them.  */
  else if (fvalue1 == fvalue2)
    flags = Z|C;
  else if (fvalue1 < fvalue2)
    flags = N;
  else /* (fvalue1 > fvalue2).  */
    flags = C;

  aarch64_set_CPSR (cpu, flags);
}

/* Float compare: set the flags from comparing register instr[9,5]
   against register instr[20,16].  */
static void
fcmps (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR ( 9,  5);

  float lhs = aarch64_get_FP_float (cpu, rn);
  float rhs = aarch64_get_FP_float (cpu, rm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, lhs, rhs);
}

/* Float compare to zero -- Invalid Operation exception
   only on signaling NaNs.  Sets the flags from comparing register
   instr[9,5] against 0.0f.  */
static void
fcmpzs (sim_cpu *cpu)
{
  unsigned rn = INSTR ( 9,  5);
  float lhs = aarch64_get_FP_float (cpu, rn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, lhs, 0.0f);
}

/* Float compare -- Invalid Operation exception on all NaNs.  Sets the
   flags from comparing register instr[9,5] against register
   instr[20,16].  */
static void
fcmpes (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR ( 9,  5);

  float lhs = aarch64_get_FP_float (cpu, rn);
  float rhs = aarch64_get_FP_float (cpu, rm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, lhs, rhs);
}

/* Float compare to zero -- Invalid Operation exception on all NaNs.
   Sets the flags from comparing register instr[9,5] against 0.0f.  */
static void
fcmpzes (sim_cpu *cpu)
{
  unsigned rn = INSTR ( 9,  5);
  float lhs = aarch64_get_FP_float (cpu, rn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_float_compare (cpu, lhs, 0.0f);
}

/* Set the CPSR condition flags from a comparison of DVAL1 against
   DVAL2:
     either operand is a NaN  ==> C|V
     DVAL1 == DVAL2           ==> Z|C
     DVAL1 <  DVAL2           ==> N
     DVAL1 >  DVAL2           ==> C.  */
static void
set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
{
  uint32_t flags;

  /* FIXME: Add exception raising.  */
  if (isnan (dval1) || isnan (dval2))
    flags = C|V;
  /* With NaNs excluded the operands are ordered, so compare them
     directly.  This also copes with infinities: subtracting two
     infinities may give a NaN, and isinf is only specified to return a
     nonzero value, not the sign, so neither can be used to order them.  */
  else if (dval1 == dval2)
    flags = Z|C;
  else if (dval1 < dval2)
    flags = N;
  else /* (dval1 > dval2).  */
    flags = C;

  aarch64_set_CPSR (cpu, flags);
}

/* Double compare -- Invalid Operation exception only on signaling NaNs.
   Sets the flags from comparing register instr[9,5] against register
   instr[20,16].  */
static void
fcmpd (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR ( 9,  5);

  double lhs = aarch64_get_FP_double (cpu, rn);
  double rhs = aarch64_get_FP_double (cpu, rm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, lhs, rhs);
}

/* Double compare to zero -- Invalid Operation exception
   only on signaling NaNs.  Sets the flags from comparing register
   instr[9,5] against 0.0.  */
static void
fcmpzd (sim_cpu *cpu)
{
  unsigned rn = INSTR ( 9,  5);
  double lhs = aarch64_get_FP_double (cpu, rn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, lhs, 0.0);
}

/* Double compare -- Invalid Operation exception on all NaNs.  Sets the
   flags from comparing register instr[9,5] against register
   instr[20,16].  */
static void
fcmped (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR ( 9,  5);

  double lhs = aarch64_get_FP_double (cpu, rn);
  double rhs = aarch64_get_FP_double (cpu, rm);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, lhs, rhs);
}

/* Double compare to zero -- Invalid Operation exception on all NaNs.
   Sets the flags from comparing register instr[9,5] against 0.0.  */
static void
fcmpzed (sim_cpu *cpu)
{
  unsigned rn = INSTR ( 9,  5);
  double lhs = aarch64_get_FP_double (cpu, rn);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  set_flags_for_double_compare (cpu, lhs, 0.0);
}

/* Decode the scalar FP compare group and dispatch to the matching
   FCMP/FCMPE routine for the operand precision.  */
static void
dexSimpleFPCompare (sim_cpu *cpu)
{
  /* assert instr[28,25] == 1111
     instr[30:24:21:13,10] = 0011000
     instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
     instr[29] ==> S :  0 ==> OK, 1 ==> UNALLOC
     instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
     instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
     instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
                              01000 ==> FCMPZ, 11000 ==> FCMPEZ,
                              ow ==> UNALLOC  */
  uint32_t selector;
  uint32_t m_and_s = (INSTR (31, 31) << 1) | INSTR (29, 29);
  uint32_t fp_type = INSTR (23, 22);
  uint32_t op_field = INSTR (15, 14);
  uint32_t opcode2_low = INSTR (2, 0);

  if (opcode2_low != 0)
    HALT_UNALLOC;

  if (m_and_s != 0)
    HALT_UNALLOC;

  if (fp_type > 1)
    HALT_UNALLOC;

  if (op_field != 0)
    HALT_UNALLOC;

  /* Bit 2 of the selector is the precision, bits 1 and 0 are the top
     two bits of opcode2.  */
  selector = (fp_type << 2) | INSTR (4, 3);

  switch (selector)
    {
      /* Single precision.  */
    case 0:
      fcmps (cpu);
      return;
    case 1:
      fcmpzs (cpu);
      return;
    case 2:
      fcmpes (cpu);
      return;
    case 3:
      fcmpzes (cpu);
      return;

      /* Double precision.  */
    case 4:
      fcmpd (cpu);
      return;
    case 5:
      fcmpzd (cpu);
      return;
    case 6:
      fcmped (cpu);
      return;
    case 7:
      fcmpzed (cpu);
      return;
    }
}

static void
do_scalar_FADDP (sim_cpu *cpu)
{
  /* Scalar pairwise add: Fd = Fn[0] + Fn[1].
     instr [31,23] = 0111 1110 0
     instr [22]    = single(0)/double(1)
     instr [21,10] = 11 0000 1101 10
     instr [9,5]   = Fn
     instr [4,0]   = Fd.  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FC);
  NYI_assert (21, 10, 0xC36);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22) == 0)
    {
      /* Single precision: sum the two lowest float lanes.  */
      float lane0 = aarch64_get_vec_float (cpu, src, 0);
      float lane1 = aarch64_get_vec_float (cpu, src, 1);

      aarch64_set_FP_float (cpu, dst, lane0 + lane1);
    }
  else
    {
      /* Double precision: sum the two double lanes.  */
      double lane0 = aarch64_get_vec_double (cpu, src, 0);
      double lane1 = aarch64_get_vec_double (cpu, src, 1);

      aarch64_set_FP_double (cpu, dst, lane0 + lane1);
    }
}

/* Floating point absolute difference: Rd = |Rn - Rm|.  */

static void
do_scalar_FABD (sim_cpu *cpu)
{
  /* instr [31,23] = 0111 1110 1
     instr [22]    = float(0)/double(1)
     instr [21]    = 1
     instr [20,16] = Rm
     instr [15,10] = 1101 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned subtrahend_reg = INSTR (20, 16);
  unsigned minuend_reg = INSTR (9, 5);
  unsigned dest_reg = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FD);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 10, 0x35);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      double lhs = aarch64_get_FP_double (cpu, minuend_reg);
      double rhs = aarch64_get_FP_double (cpu, subtrahend_reg);

      aarch64_set_FP_double (cpu, dest_reg, fabs (lhs - rhs));
    }
  else
    {
      float lhs = aarch64_get_FP_float (cpu, minuend_reg);
      float rhs = aarch64_get_FP_float (cpu, subtrahend_reg);

      aarch64_set_FP_float (cpu, dest_reg, fabsf (lhs - rhs));
    }
}

static void
do_scalar_CMGT (sim_cpu *cpu)
{
  /* Scalar 64-bit integer compare greater than.
     instr [31,30] = 01
     instr [29]    = U : 0 ==> CMGT (signed), 1 ==> CMHI (unsigned)
     instr [28,21] = 1 1110 111
     instr [20,16] = Rm
     instr [15,10] = 00 1101
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.

     Both values of U are accepted: the U == 0 encoding must compare
     the operands as signed values, while the U == 1 encoding (which
     is how do_scalar_vec reaches this routine) compares them as
     unsigned values.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  int is_greater;

  NYI_assert (31, 30, 1);
  NYI_assert (28, 21, 0xF7);
  NYI_assert (15, 10, 0x0D);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (29, 29))
    is_greater = (aarch64_get_vec_u64 (cpu, rn, 0)
                  > aarch64_get_vec_u64 (cpu, rm, 0));
  else
    is_greater = (aarch64_get_vec_s64 (cpu, rn, 0)
                  > aarch64_get_vec_s64 (cpu, rm, 0));

  /* The result is an all-ones or all-zeros 64-bit mask.  */
  aarch64_set_vec_u64 (cpu, rd, 0, is_greater ? ~(uint64_t) 0 : 0);
}

static void
do_scalar_USHR (sim_cpu *cpu)
{
  /* Scalar unsigned shift right by immediate (64-bit element).
     instr [31,23] = 0111 1111 0
     instr [22,16] = shift amount, encoded as 128 - shift
     instr [15,10] = 0000 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned amount;
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t value;

  NYI_assert (31, 23, 0x0FE);
  NYI_assert (15, 10, 0x01);

  /* As in do_scalar_shift, only encodings with bit 22 set are valid;
     without it the computed shift would exceed the element width.  */
  if (INSTR (22, 22) == 0)
    HALT_UNALLOC;

  /* With bit 22 set this is in the range 1..64.  */
  amount = 128 - INSTR (22, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  value = aarch64_get_vec_u64 (cpu, rn, 0);

  /* Shifting a 64-bit value by 64 is undefined in C; the instruction
     defines the result as zero, so handle that case explicitly.  */
  aarch64_set_vec_u64 (cpu, rd, 0, amount < 64 ? value >> amount : 0);
}

static void
do_scalar_SSHL (sim_cpu *cpu)
{
  /* Scalar signed shift left by register (64-bit element).
     instr [31,21] = 0101 1110 111
     instr [20,16] = Rm
     instr [15,10] = 0100 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.

     The shift count is the signed low byte of Rm: a non-negative
     count shifts left, a negative count shifts right arithmetically.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
  int64_t value;
  int64_t result;

  NYI_assert (31, 21, 0x2F7);
  NYI_assert (15, 10, 0x11);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  value = aarch64_get_vec_s64 (cpu, rn, 0);

  if (shift >= 0)
    {
      /* Counts of 64 or more push every bit out.  The shift is done
         on the unsigned representation because left-shifting a
         negative signed value is undefined in C.  */
      if (shift < 64)
        result = (int64_t) ((uint64_t) value << shift);
      else
        result = 0;
    }
  else
    {
      /* -shift is in the range 1..128.  Any arithmetic right shift of
         63 or more leaves only copies of the sign bit, so clamp the
         count to stay within what C defines.  */
      int right = - shift;

      result = value >> (right < 64 ? right : 63);
    }

  aarch64_set_vec_s64 (cpu, rd, 0, result);
}

/* Scalar FCMGE against zero: Rd = (Rn >= 0.0) ? all ones : 0.  */
static void
do_scalar_FCMGE_zero (sim_cpu *cpu)
{
  /* instr [31,23] = 0111 1110 1
     instr [22,22] = size
     instr [21,16] = 1000 00
     instr [15,10] = 1100 10
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned is_double = INSTR (22, 22);
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FD);
  NYI_assert (21, 16, 0x20);
  NYI_assert (15, 10, 0x32);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (is_double)
    {
      double operand = aarch64_get_vec_double (cpu, src, 0);

      aarch64_set_vec_u64 (cpu, dst, 0, operand >= 0.0 ? -1 : 0);
    }
  else
    {
      float operand = aarch64_get_vec_float (cpu, src, 0);

      aarch64_set_vec_u32 (cpu, dst, 0, operand >= 0.0 ? -1 : 0);
    }
}

/* Scalar FCMLE against zero: Rd = (Rn <= 0.0) ? all ones : 0.  */
static void
do_scalar_FCMLE_zero (sim_cpu *cpu)
{
  /* instr [31,23] = 0111 1110 1
     instr [22,22] = size
     instr [21,16] = 1000 00
     instr [15,10] = 1101 10
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned is_double = INSTR (22, 22);
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FD);
  NYI_assert (21, 16, 0x20);
  NYI_assert (15, 10, 0x36);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (is_double)
    {
      double operand = aarch64_get_vec_double (cpu, src, 0);

      aarch64_set_vec_u64 (cpu, dst, 0, operand <= 0.0 ? -1 : 0);
    }
  else
    {
      float operand = aarch64_get_vec_float (cpu, src, 0);

      aarch64_set_vec_u32 (cpu, dst, 0, operand <= 0.0 ? -1 : 0);
    }
}

/* Scalar FCMGT against zero: Rd = (Rn > 0.0) ? all ones : 0.  */
static void
do_scalar_FCMGT_zero (sim_cpu *cpu)
{
  /* instr [31,23] = 0101 1110 1
     instr [22,22] = size
     instr [21,16] = 1000 00
     instr [15,10] = 1100 10
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned is_double = INSTR (22, 22);
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 23, 0x0BD);
  NYI_assert (21, 16, 0x20);
  NYI_assert (15, 10, 0x32);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (is_double)
    {
      double operand = aarch64_get_vec_double (cpu, src, 0);

      aarch64_set_vec_u64 (cpu, dst, 0, operand > 0.0 ? -1 : 0);
    }
  else
    {
      float operand = aarch64_get_vec_float (cpu, src, 0);

      aarch64_set_vec_u32 (cpu, dst, 0, operand > 0.0 ? -1 : 0);
    }
}

/* Scalar FCMEQ against zero: Rd = (Rn == 0.0) ? all ones : 0.  */
static void
do_scalar_FCMEQ_zero (sim_cpu *cpu)
{
  /* instr [31,23] = 0101 1110 1
     instr [22,22] = size
     instr [21,16] = 1000 00
     instr [15,10] = 1101 10
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned is_double = INSTR (22, 22);
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 23, 0x0BD);
  NYI_assert (21, 16, 0x20);
  NYI_assert (15, 10, 0x36);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (is_double)
    {
      double operand = aarch64_get_vec_double (cpu, src, 0);

      aarch64_set_vec_u64 (cpu, dst, 0, operand == 0.0 ? -1 : 0);
    }
  else
    {
      float operand = aarch64_get_vec_float (cpu, src, 0);

      aarch64_set_vec_u32 (cpu, dst, 0, operand == 0.0 ? -1 : 0);
    }
}

/* Scalar FCMLT against zero: Rd = (Rn < 0.0) ? all ones : 0.  */
static void
do_scalar_FCMLT_zero (sim_cpu *cpu)
{
  /* instr [31,23] = 0101 1110 1
     instr [22,22] = size
     instr [21,16] = 1000 00
     instr [15,10] = 1110 10
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned is_double = INSTR (22, 22);
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 23, 0x0BD);
  NYI_assert (21, 16, 0x20);
  NYI_assert (15, 10, 0x3A);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (is_double)
    {
      double operand = aarch64_get_vec_double (cpu, src, 0);

      aarch64_set_vec_u64 (cpu, dst, 0, operand < 0.0 ? -1 : 0);
    }
  else
    {
      float operand = aarch64_get_vec_float (cpu, src, 0);

      aarch64_set_vec_u32 (cpu, dst, 0, operand < 0.0 ? -1 : 0);
    }
}

static void
do_scalar_shift (sim_cpu *cpu)
{
  /* Scalar shift by immediate on a 64-bit element.
     instr [31,23] = 0101 1111 0
     instr [22,16] = shift amount
     instr [15,10] = 0101 01   [SHL]
     instr [15,10] = 0000 01   [SSHR]
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  unsigned amount;

  NYI_assert (31, 23, 0x0BE);

  /* Bit 22 must be set; it selects the 64-bit element size.  */
  if (INSTR (22, 22) == 0)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (15, 10))
    {
    case 0x01: /* SSHR */
      /* The encoded value gives a right shift of 1..64.  A shift by 64
         is undefined in C, and an arithmetic shift by 63 already
         yields the all-sign-bits result it should produce.  */
      amount = 128 - INSTR (22, 16);
      if (amount > 63)
        amount = 63;
      aarch64_set_vec_s64 (cpu, rd, 0,
                           aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
      return;
    case 0x15: /* SHL */
      /* With bit 22 set this is a left shift of 0..63.  */
      amount = INSTR (22, 16) - 64;
      aarch64_set_vec_u64 (cpu, rd, 0,
                           aarch64_get_vec_u64 (cpu, rn, 0) << amount);
      return;
    default:
      HALT_NYI;
    }
}

/* Scalar FCMEQ, FCMGE and FCMGT, plus the absolute-value variants
   selected by the AC bit.  The destination receives an all-ones mask
   when the comparison holds and zero otherwise.  */
static void
do_scalar_FCM (sim_cpu *cpu)
{
  /* instr [31,30] = 01
     instr [29]    = U
     instr [28,24] = 1 1110
     instr [23]    = E
     instr [22]    = size
     instr [21]    = 1
     instr [20,16] = Rm
     instr [15,12] = 1110
     instr [11]    = AC
     instr [10]    = 1
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  /* Selector built as E:U:AC.  */
  unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
  unsigned result = 0;

  NYI_assert (31, 30, 1);
  NYI_assert (28, 24, 0x1E);
  NYI_assert (21, 21, 1);
  NYI_assert (15, 12, 0xE);
  NYI_assert (10, 10, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22))
    {
      double dval1 = aarch64_get_FP_double (cpu, rn);
      double dval2 = aarch64_get_FP_double (cpu, rm);

      switch (EUac)
        {
        case 0: /* 000 */
          result = dval1 == dval2;
          break;

        case 3: /* 011 */
          dval1 = fabs (dval1);
          dval2 = fabs (dval2);
          /* Fall through. */
        case 2: /* 010 */
          result = dval1 >= dval2;
          break;

        case 7: /* 111 */
          dval1 = fabs (dval1);
          dval2 = fabs (dval2);
          /* Fall through. */
        case 6: /* 110 */
          result = dval1 > dval2;
          break;

        default:
          HALT_UNALLOC;
        }

      /* A double-precision compare produces a 64-bit mask, so the whole
         64-bit element must be written, not just its low half.  */
      aarch64_set_vec_u64 (cpu, rd, 0, result ? ~(uint64_t) 0 : 0);
    }
  else
    {
      float fval1 = aarch64_get_FP_float (cpu, rn);
      float fval2 = aarch64_get_FP_float (cpu, rm);

      switch (EUac)
        {
        case 0: /* 000 */
          result = fval1 == fval2;
          break;

        case 3: /* 011 */
          fval1 = fabsf (fval1);
          fval2 = fabsf (fval2);
          /* Fall through. */
        case 2: /* 010 */
          result = fval1 >= fval2;
          break;

        case 7: /* 111 */
          fval1 = fabsf (fval1);
          fval2 = fabsf (fval2);
          /* Fall through. */
        case 6: /* 110 */
          result = fval1 > fval2;
          break;

        default:
          HALT_UNALLOC;
        }

      aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
    }
}

/* Scalar MOV (an alias of DUP): copy one vector element of Rn into
   element 0 of Rd.  */
static void
do_scalar_MOV (sim_cpu *cpu)
{
  /* instr [31,21] = 0101 1110 000
     instr [20,16] = imm5
     instr [15,10] = 0000 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.

     The lowest set bit of imm5 selects the element size; the bits
     above it form the element index.  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  unsigned imm5;
  unsigned lane;

  NYI_assert (31, 21, 0x2F0);
  NYI_assert (15, 10, 0x01);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  imm5 = INSTR (20, 16);

  if (imm5 & 0x1)
    {
      /* 8-bit.  */
      lane = imm5 >> 1;
      aarch64_set_vec_u8 (cpu, dst, 0, aarch64_get_vec_u8 (cpu, src, lane));
    }
  else if (imm5 & 0x2)
    {
      /* 16-bit.  */
      lane = imm5 >> 2;
      aarch64_set_vec_u16 (cpu, dst, 0, aarch64_get_vec_u16 (cpu, src, lane));
    }
  else if (imm5 & 0x4)
    {
      /* 32-bit.  */
      lane = imm5 >> 3;
      aarch64_set_vec_u32 (cpu, dst, 0, aarch64_get_vec_u32 (cpu, src, lane));
    }
  else if (imm5 & 0x8)
    {
      /* 64-bit.  */
      lane = imm5 >> 4;
      aarch64_set_vec_u64 (cpu, dst, 0, aarch64_get_vec_u64 (cpu, src, lane));
    }
  else
    HALT_UNALLOC;
}

static void
do_scalar_NEG (sim_cpu *cpu)
{
  /* Scalar negate of the 64-bit element 0 of Rn into Rd.
     instr [31,10] = 0111 1110 1110 0000 1011 10
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t operand;

  NYI_assert (31, 10, 0x1FB82E);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  operand = aarch64_get_vec_u64 (cpu, src, 0);
  /* Unsigned negation wraps modulo 2^64.  */
  aarch64_set_vec_u64 (cpu, dst, 0, - operand);
}

static void
do_scalar_USHL (sim_cpu *cpu)
{
  /* Scalar unsigned shift left by register (64-bit element).
     instr [31,21] = 0111 1110 111
     instr [20,16] = Rm
     instr [15,10] = 0100 01
     instr [9, 5]  = Rn
     instr [4, 0]  = Rd.

     The shift count is the signed low byte of Rm: a non-negative
     count shifts left, a negative count shifts right logically.  */

  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
  /* Magnitude of the shift; -shift is at most 128.  */
  unsigned distance = shift >= 0 ? (unsigned) shift : (unsigned) - shift;
  uint64_t value;
  uint64_t result;

  NYI_assert (31, 21, 0x3F7);
  NYI_assert (15, 10, 0x11);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  value = aarch64_get_vec_u64 (cpu, rn, 0);

  /* Shifting a 64-bit value by 64 or more is undefined in C; for an
     unsigned element every bit is shifted out, so the result is zero.  */
  if (distance >= 64)
    result = 0;
  else if (shift >= 0)
    result = value << distance;
  else
    result = value >> distance;

  aarch64_set_vec_u64 (cpu, rd, 0, result);
}

static void
do_double_add (sim_cpu *cpu)
{
  /* Scalar ADD of two 64-bit D registers: Dd = Dn + Dm.
     instr [31,21] = 0101 1110 111
     instr [20,16] = Rm
     instr [15,10] = 1000 01
     instr [9,5]   = Rn
     instr [4,0]   = Rd.

     This encoding is the integer ADD, so the operands are added as
     64-bit integers (wrapping modulo 2^64), not as floating point
     doubles.  */
  unsigned Fd;
  unsigned Fm;
  unsigned Fn;
  uint64_t val1;
  uint64_t val2;

  NYI_assert (31, 21, 0x2F7);
  NYI_assert (15, 10, 0x21);

  Fd = INSTR (4, 0);
  Fn = INSTR (9, 5);
  Fm = INSTR (20, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val1 = aarch64_get_vec_u64 (cpu, Fn, 0);
  val2 = aarch64_get_vec_u64 (cpu, Fm, 0);

  aarch64_set_vec_u64 (cpu, Fd, 0, val1 + val2);
}

static void
do_scalar_UCVTF (sim_cpu *cpu)
{
  /* Scalar unsigned integer to floating point conversion.
     instr [31,23] = 0111 1110 0
     instr [22]    = single(0)/double(1)
     instr [21,10] = 10 0001 1101 10
     instr [9,5]   = rn
     instr [4,0]   = rd.  */

  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  NYI_assert (31, 23, 0x0FC);
  NYI_assert (21, 10, 0x876);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (INSTR (22, 22) == 0)
    {
      /* 32-bit unsigned source, single precision result.  */
      uint32_t narrow = aarch64_get_vec_u32 (cpu, src, 0);

      aarch64_set_vec_float (cpu, dst, 0, (float) narrow);
    }
  else
    {
      /* 64-bit unsigned source, double precision result.  */
      uint64_t wide = aarch64_get_vec_u64 (cpu, src, 0);

      aarch64_set_vec_double (cpu, dst, 0, (double) wide);
    }
}

static void
do_scalar_vec (sim_cpu *cpu)
{
  /* Dispatch the scalar Advanced SIMD instructions on instr [31,23]
     and then on instr [15,10] (and instr [21,16] where needed).
     Every encoding that has no handler halts with NYI.

     instr [30] = 1.  */
  /* instr [28,25] = 1111.  */
  switch (INSTR (31, 23))
    {
    case 0xBC:
      switch (INSTR (15, 10))
	{
	case 0x01: do_scalar_MOV (cpu); return;
	case 0x39: do_scalar_FCM (cpu); return;
	case 0x3B: do_scalar_FCM (cpu); return;
	default:
	  /* Unhandled encodings must not be executed as silent no-ops.  */
	  HALT_NYI;
	}
      break;

    case 0xBE: do_scalar_shift (cpu); return;

    case 0xFC:
      switch (INSTR (15, 10))
	{
	case 0x36:
	  switch (INSTR (21, 16))
	    {
	    case 0x30: do_scalar_FADDP (cpu); return;
	    case 0x21: do_scalar_UCVTF (cpu); return;
	    }
	  HALT_NYI;
	  break;
	case 0x39: do_scalar_FCM (cpu); return;
	case 0x3B: do_scalar_FCM (cpu); return;
	default:
	  /* Unhandled encodings must not be executed as silent no-ops.  */
	  HALT_NYI;
	}
      break;

    case 0xFD:
      switch (INSTR (15, 10))
	{
	case 0x0D: do_scalar_CMGT (cpu); return;
	case 0x11: do_scalar_USHL (cpu); return;
	case 0x2E: do_scalar_NEG (cpu); return;
	case 0x32: do_scalar_FCMGE_zero (cpu); return;
	case 0x35: do_scalar_FABD (cpu); return;
	case 0x36: do_scalar_FCMLE_zero (cpu); return;
	case 0x39: do_scalar_FCM (cpu); return;
	case 0x3B: do_scalar_FCM (cpu); return;
	default:
	  HALT_NYI;
	}
      break;

    case 0xFE: do_scalar_USHR (cpu); return;

    case 0xBD:
      switch (INSTR (15, 10))
	{
	case 0x21: do_double_add (cpu); return;
	case 0x11: do_scalar_SSHL (cpu); return;
	case 0x32: do_scalar_FCMGT_zero (cpu); return;
	case 0x36: do_scalar_FCMEQ_zero (cpu); return;
	case 0x3A: do_scalar_FCMLT_zero (cpu); return;
	default:
	  HALT_NYI;
	}
      break;

    default:
      HALT_NYI;
    }
}

static void
dexAdvSIMD1 (sim_cpu *cpu)
{
  /* Top level decode for instr [28,25] = 1 111.  */

  /* Bit 30 set selects the scalar vector handlers; the basic scalar
     fp groups decoded below all have bit 30 clear.  */
  if (INSTR (30, 30))
    {
      do_scalar_vec (cpu);
      return;
    }

  /* instr[24] is set for FP data processing 3-source and clear for
     all other basic scalar fp instruction groups.  */
  if (INSTR (24, 24))
    {
      dexSimpleFPDataProc3Source (cpu);
      return;
    }

  /* instr[21] is clear for floating <-> fixed conversions and set for
     all other basic scalar fp instruction groups.  */
  if (INSTR (21, 21) == 0)
    {
      dexSimpleFPFixedConvert (cpu);
      return;
    }

  /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
     11 ==> cond select,  00 ==> other.  */
  switch (INSTR (11, 10))
    {
    case 1: dexSimpleFPCondCompare (cpu); return;
    case 2: dexSimpleFPDataProc2Source (cpu); return;
    case 3: dexSimpleFPCondSelect (cpu); return;
    default: break;
    }

  /* instr[11,10] == 00: an ordered cascade of tests.
     FP immediate has instr [12] == 1.
     FP compare has   instr [13] == 1.
     FP Data Proc 1 Source has instr [14] == 1.
     FP floating <--> integer conversions has instr [15] == 0.  */
  if (INSTR (12, 12))
    dexSimpleFPImmediate (cpu);
  else if (INSTR (13, 13))
    dexSimpleFPCompare (cpu);
  else if (INSTR (14, 14))
    dexSimpleFPDataProc1Source (cpu);
  else if (INSTR (15, 15) == 0)
    dexSimpleFPIntegerConvert (cpu);
  else
    /* If we get here then instr[15] == 1 which means UNALLOC.  */
    HALT_UNALLOC;
}

/* PC relative addressing.  */

static void
pcadr (sim_cpu *cpu)
{
  /* ADR / ADRP: write PC (or its 4K page base) plus a signed
     immediate offset to Rd.
     instr[31] = op : 0 ==> ADR, 1 ==> ADRP
     instr[30,29] = immlo
     instr[23,5] = immhi.  */
  unsigned dest = INSTR (4, 0);
  uint32_t page_form = INSTR (31, 31);
  /* Sign extended immhi, reinterpreted as unsigned so that the
     shifts below are well defined.  */
  uint64_t disp = (uint64_t) simm64 (aarch64_get_instr (cpu), 23, 5);
  uint64_t base;

  /* Append immlo as the two low bits of the offset.  */
  disp = (disp << 2) | INSTR (30, 29);

  base = aarch64_get_PC (cpu);

  if (page_form)
    {
      /* ADRP: the offset counts 4K pages and the base is the page
         containing the PC.  */
      disp <<= 12;
      base &= ~ (uint64_t) 0xfff;
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dest, NO_SP, base + disp);
}

/* Specific decode and execute for group Data Processing Immediate.  */

static void
dexPCRelAddressing (sim_cpu *cpu)
{
  /* The caller has already matched instr[28,24] = 10000; ADR and ADRP
     are both handled by pcadr.  */
  pcadr (cpu);
}

/* Immediate logical.
   The bimm32/64 argument is constructed by replicating a 2, 4, 8,
   16, 32 or 64 bit sequence pulled out at decode and possibly
   inverting it.

   N.B. the output register (dest) can normally be Xn or SP.
   The exception occurs for flag setting instructions, which may
   only use Xn for the output (dest).  The input register can
   never be SP.  */

/* 32 bit AND with immediate; the destination may be SP.  */
static void
and32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t result;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  result = aarch64_get_reg_u32 (cpu, src, NO_SP) & bimm;
  aarch64_set_reg_u64 (cpu, dst, SP_OK, result);
}

/* 64 bit AND with immediate; the destination may be SP.  */
static void
and64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t result;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  result = aarch64_get_reg_u64 (cpu, src, NO_SP) & bimm;
  aarch64_set_reg_u64 (cpu, dst, SP_OK, result);
}

/* 32 bit AND with immediate, also updating the flags from the
   result.  The destination may not be SP.  */
static void
ands32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t result = aarch64_get_reg_u32 (cpu, src, NO_SP) & bimm;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, result);
  set_flags_for_binop32 (cpu, result);
}

/* 64 bit AND with immediate, also updating the flags from the
   result.  The destination may not be SP.  */
static void
ands64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t result = aarch64_get_reg_u64 (cpu, src, NO_SP) & bimm;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, result);
  set_flags_for_binop64 (cpu, result);
}

/* 32 bit exclusive OR with immediate; the destination may be SP.  */
static void
eor32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t result;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  result = aarch64_get_reg_u32 (cpu, src, NO_SP) ^ bimm;
  aarch64_set_reg_u64 (cpu, dst, SP_OK, result);
}

/* 64 bit exclusive OR with immediate; the destination may be SP.  */
static void
eor64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t result;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  result = aarch64_get_reg_u64 (cpu, src, NO_SP) ^ bimm;
  aarch64_set_reg_u64 (cpu, dst, SP_OK, result);
}

/* 32 bit inclusive OR with immediate; the destination may be SP.  */
static void
orr32 (sim_cpu *cpu, uint32_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t result;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  result = aarch64_get_reg_u32 (cpu, src, NO_SP) | bimm;
  aarch64_set_reg_u64 (cpu, dst, SP_OK, result);
}

/* 64 bit inclusive OR with immediate; the destination may be SP.  */
static void
orr64 (sim_cpu *cpu, uint64_t bimm)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t result;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  result = aarch64_get_reg_u64 (cpu, src, NO_SP) | bimm;
  aarch64_set_reg_u64 (cpu, dst, SP_OK, result);
}

/* Logical shifted register.
   These allow an optional LSL, ASR, LSR or ROR to the second source
   register with a count up to the register bit count.
   N.B register args may not be SP.  */

/* 32 bit AND: Rd = Rn & shifted (Rm).  */
static void
and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t lhs;
  uint32_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  rhs = shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs & rhs);
}

/* 64 bit AND: Rd = Rn & shifted (Rm).  */
static void
and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t lhs;
  uint64_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  rhs = shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs & rhs);
}

/* 32 bit AND: Rd = Rn & shifted (Rm), also updating the flags from
   the result.  */
static void
ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  uint32_t lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  uint32_t rhs = shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP),
                            shift, count);
  uint32_t result = lhs & rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, result);
  set_flags_for_binop32 (cpu, result);
}

/* 64 bit AND: Rd = Rn & shifted (Rm), also updating the flags from
   the result.  */
static void
ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  uint64_t lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  uint64_t rhs = shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP),
                            shift, count);
  uint64_t result = lhs & rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, result);
  set_flags_for_binop64 (cpu, result);
}

/* 32 bit BIC (bit clear): Rd = Rn & ~shifted (Rm).  */
static void
bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t lhs;
  uint32_t inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  inverted = ~ shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP),
                          shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs & inverted);
}

/* 64 bit BIC (bit clear): Rd = Rn & ~shifted (Rm).  */
static void
bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t lhs;
  uint64_t inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  inverted = ~ shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP),
                          shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs & inverted);
}

/* 32 bit BIC (bit clear): Rd = Rn & ~shifted (Rm), also updating the
   flags from the result.  */
static void
bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  uint32_t lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  uint32_t inverted = ~ shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP),
                                   shift, count);
  uint32_t result = lhs & inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, result);
  set_flags_for_binop32 (cpu, result);
}

/* 64 bit BIC (bit clear): Rd = Rn & ~shifted (Rm), also updating the
   flags from the result.  */
static void
bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);

  uint64_t lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  uint64_t inverted = ~ shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP),
                                   shift, count);
  uint64_t result = lhs & inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, result);
  set_flags_for_binop64 (cpu, result);
}

/* 32 bit EON (exclusive OR NOT): Rd = Rn ^ ~shifted (Rm).  */
static void
eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t lhs;
  uint32_t inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  inverted = ~ shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP),
                          shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs ^ inverted);
}

/* 64 bit EON (exclusive OR NOT): Rd = Rn ^ ~shifted (Rm).  */
static void
eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t lhs;
  uint64_t inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  inverted = ~ shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP),
                          shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs ^ inverted);
}

/* 32 bit EOR: Rd = Rn ^ shifted (Rm).  */
static void
eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t lhs;
  uint32_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  rhs = shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs ^ rhs);
}

/* 64 bit EOR: Rd = Rn ^ shifted (Rm).  */
static void
eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t lhs;
  uint64_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  rhs = shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs ^ rhs);
}

/* 32 bit ORR: Rd = Rn | shifted (Rm).  */
static void
orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t lhs;
  uint32_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  rhs = shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs | rhs);
}

/* 64 bit ORR: Rd = Rn | shifted (Rm).  */
static void
orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t lhs;
  uint64_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  rhs = shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP), shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs | rhs);
}

/* 32 bit ORN (OR NOT): Rd = Rn | ~shifted (Rm).  */
static void
orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t lhs;
  uint32_t inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u32 (cpu, op1_reg, NO_SP);
  inverted = ~ shifted32 (aarch64_get_reg_u32 (cpu, op2_reg, NO_SP),
                          shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs | inverted);
}

/* 64 bit ORN (OR NOT): Rd = Rn | ~shifted (Rm).  */
static void
orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
{
  unsigned op2_reg = INSTR (20, 16);
  unsigned op1_reg = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t lhs;
  uint64_t inverted;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u64 (cpu, op1_reg, NO_SP);
  inverted = ~ shifted64 (aarch64_get_reg_u64 (cpu, op2_reg, NO_SP),
                          shift, count);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, lhs | inverted);
}

static void
dexLogicalImmediate (sim_cpu *cpu)
{
  /* Decode and dispatch the logical (immediate) group.
     assert instr[28,23] = 1001000
     instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
     instr[22] = N : used to construct immediate mask
     instr[21,16] = immr
     instr[15,10] = imms
     instr[9,5] = Rn
     instr[4,0] = Rd  */

  uint32_t is_64bit = INSTR (31, 31);
  uint32_t n_bit = INSTR (22, 22);
  /* N:immr:imms together index the table of decoded bit masks.  */
  uint32_t table_slot = INSTR (22, 10);
  uint64_t mask = LITable [table_slot];
  uint32_t op = INSTR (30, 29);

  /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
  if (n_bit && ! is_64bit)
    HALT_UNALLOC;

  /* A zero table entry is treated as an unallocated encoding.  */
  if (mask == 0)
    HALT_UNALLOC;

  switch (op)
    {
    case 0:
      if (is_64bit)
        and64 (cpu, mask);
      else
        and32 (cpu, (uint32_t) mask);
      return;

    case 1:
      if (is_64bit)
        orr64 (cpu, mask);
      else
        orr32 (cpu, (uint32_t) mask);
      return;

    case 2:
      if (is_64bit)
        eor64 (cpu, mask);
      else
        eor32 (cpu, (uint32_t) mask);
      return;

    case 3:
      if (is_64bit)
        ands64 (cpu, mask);
      else
        ands32 (cpu, (uint32_t) mask);
      return;
    }

  HALT_UNALLOC;
}

/* Immediate move.
   The uimm argument is a 16 bit value to be inserted into the
   target register.  The pos argument locates the 16 bit word in the
   dest register, i.e. it is in {0, 1} for 32 bit and {0, 1, 2,
   3} for 64 bit.
   N.B. the register arg may not be SP, so it should be
   accessed using the setGZRegisterXXX accessors.  */

/* 32 bit MOVZ: place the 16 bit immediate in halfword POS and zero
   all the other bits.  */
static void
movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned dst = INSTR (4, 0);
  uint32_t placed = val << (pos * 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, placed);
}

/* 64 bit MOVZ: place the 16 bit immediate in halfword POS and zero
   all the other bits.  */
static void
movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned dst = INSTR (4, 0);
  /* Widen before shifting so that halfwords 2 and 3 are reachable.  */
  uint64_t placed = (uint64_t) val << (pos * 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, placed);
}

/* 32 bit MOVN: place the 16 bit immediate in halfword POS and write
   the bitwise inverse of the resulting 32 bit value.  */
static void
movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned dst = INSTR (4, 0);
  uint32_t placed = val << (pos * 16);
  uint32_t inverted = placed ^ 0xffffffffU;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, inverted);
}

/* 64 bit MOVN: place the 16 bit immediate in halfword POS and write
   the bitwise inverse of the resulting 64 bit value.  */
static void
movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned dst = INSTR (4, 0);
  uint64_t placed = (uint64_t) val << (pos * 16);
  uint64_t inverted = placed ^ 0xffffffffffffffffULL;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, inverted);
}

/* 32 bit MOVK: replace halfword POS of the destination with the
   16 bit immediate, keeping the other halfword.  */
static void
movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned dst = INSTR (4, 0);
  unsigned bit_offset = pos * 16;
  uint32_t previous = aarch64_get_reg_u32 (cpu, dst, NO_SP);
  uint32_t inserted = val << bit_offset;
  uint32_t keep = ~(0xffffU << bit_offset);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, inserted | (previous & keep));
}

/* 64 bit MOVK: replace halfword POS of the destination with the
   16 bit immediate, keeping the remaining halfwords.  */
static void
movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
{
  unsigned dst = INSTR (4, 0);
  unsigned bit_offset = pos * 16;
  uint64_t previous = aarch64_get_reg_u64 (cpu, dst, NO_SP);
  uint64_t inserted = (uint64_t) val << bit_offset;
  uint64_t keep = ~(0xffffULL << bit_offset);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, inserted | (previous & keep));
}

static void
dexMoveWideImmediate (sim_cpu *cpu)
{
  /* Decode and dispatch the move wide (immediate) group.
     assert instr[28:23] = 100101
     instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
     instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
     instr[20,5] = uimm16
     instr[4,0] = Rd  */

  /* N.B. the (multiple of 16) shift is applied by the called routine,
     we just pass the multiplier.  */

  uint32_t is_64bit = INSTR (31, 31);
  uint32_t opc = INSTR (30, 29);
  uint32_t halfword = INSTR (22, 21);
  uint32_t imm16;

  /* 32 bit can only shift 0 or 1 lot of 16.
     anything else is an unallocated instruction.  */
  if (! is_64bit && halfword > 1)
    HALT_UNALLOC;

  if (opc == 1)
    HALT_UNALLOC;

  imm16 = INSTR (20, 5);

  switch (opc)
    {
    case 0:
      if (is_64bit)
        movn64 (cpu, imm16, halfword);
      else
        movn32 (cpu, imm16, halfword);
      break;

    case 2:
      if (is_64bit)
        movz64 (cpu, imm16, halfword);
      else
        movz32 (cpu, imm16, halfword);
      break;

    default:
      if (is_64bit)
        movk64 (cpu, imm16, halfword);
      else
        movk32 (cpu, imm16, halfword);
      break;
    }
}

/* Bitfield operations.
   These take a pair of bit positions r and s which are in {0..31}
   or {0..63} depending on the instruction word size.
   N.B register args may not be SP.  */

/* OK, we start with ubfm, which just needs to pick
   some bits out of the source, zero the rest and write
   the result to dest.  Just need two logical shifts.  */

/* 32 bit unsigned bitfield move; bits outside the moved field are
   zeroed.
   if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
static void
ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned src = INSTR (9, 5);
  unsigned dst;
  uint32_t field = aarch64_get_reg_u32 (cpu, src, NO_SP);
  uint32_t down;

  /* Both forms start by shifting left so that bit s lands on bit 31,
     which discards everything above the field.  */
  field <<= 31 - s;

  if (r <= s)
    /* Field is Wn<s:r>: bring bit 31 down to bit s - r, which also
       discards the bits below r.  */
    down = 31 + r - s;
  else
    /* Field is Wn<s:0>: bring bit 31 down to bit 32 + s - r, which
       leaves the low bit of the field at bit 32 - r.  */
    down = r - (s + 1);

  field >>= down;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  dst = INSTR (4, 0);
  aarch64_set_reg_u64 (cpu, dst, NO_SP, field);
}

/* 64 bit unsigned bitfield move; bits outside the moved field are zero.
   If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r,64-r> = Xn<s:0>.  */
static void
ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd;
  uint64_t field = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  /* First an LSL by 63 - s puts source bit s at bit 63, which drops
     everything above the field.  */
  uint32_t lsl = 63 - s;
  /* Then an LSR brings the field to its final place:
     r <= s : bit 63 goes down to bit s - r, i.e. shift by 63 + r - s,
	      leaving Xn<s:r> at the bottom of the word.
     r > s  : bit 63 goes down to bit 63-(r-1)+s, i.e. shift by
	      r - (s + 1), leaving Xn<s:0> starting at bit 63-(r-1).  */
  uint32_t lsr = (r <= s) ? 63 + r - s : r - (s + 1);

  field <<= lsl;
  field >>= lsr;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, field);
}

/* The signed versions need to insert sign bits
   on the left of the inserted bit field. so we do
   much the same as the unsigned version except we
   use an arithmetic shift right -- this just means
   we need to operate on signed values.  */

/* 32 bit bitfield move, left of affected sign-extended, right zeroed.
   If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.
   The result is written zero-extended to the 64 bit register.  */
static void
sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rd;
  unsigned rn = INSTR (9, 5);
  /* Same shift amounts as ubfm32, but the right shift is an ASR.  */
  uint32_t bits = (uint32_t) aarch64_get_reg_s32 (cpu, rn, NO_SP);
  uint32_t asr = (r <= s) ? 31 + r - s : r - (s + 1);
  int32_t value;

  /* Do the left shift on the unsigned bit pattern: left shifting a
     negative (or overflowing) signed value is undefined in C.  */
  bits <<= 31 - s;

  /* Bit s of the source is now the sign bit.  The signed right shift
     is relied upon to be arithmetic, replicating that bit into the
     vacated high bits.  */
  value = (int32_t) bits;
  value >>= asr;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
}

/* 64 bit bitfield move, left of affected sign-extended, right zeroed.
   If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r,64-r> = Xn<s:0>.  */
static void
sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rd;
  unsigned rn = INSTR (9, 5);
  /* As per ubfm but use an ASR instead of an LSR.  */
  uint64_t bits = (uint64_t) aarch64_get_reg_s64 (cpu, rn, NO_SP);
  uint32_t asr = (r <= s) ? 63 + r - s : r - (s + 1);
  int64_t value;

  /* Do the left shift on the unsigned bit pattern: left shifting a
     negative (or overflowing) signed value is undefined in C.  */
  bits <<= 63 - s;

  /* Bit s of the source is now the sign bit.  The signed right shift
     is relied upon to be arithmetic, replicating that bit into the
     vacated high bits.  */
  value = (int64_t) bits;
  value >>= asr;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
}

/* Finally, these versions leave non-affected bits
   as is. so we need to generate the bits as per
   ubfm and also generate a mask to pick the
   bits from the original and computed values.  */

/* 32 bit bitfield move, non-affected bits left as is.
   If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.
   The merged 32 bit result is written zero-extended to the
   64 bit register.  */
static void
bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd;
  uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t mask = -1;
  /* After an LSL by 31 - s has moved source bit s up to bit 31, an
     LSR positions the field:
     r <= s : by 31 + r - s, so that Wn<s:r> lands at the bottom.
     r > s  : by r - (s + 1), so that Wn<s:0> starts at bit 31-(r-1).  */
  uint32_t lsr = (r <= s) ? 31 + r - s : r - (s + 1);

  /* Pick either s+1-r or s+1 consecutive bits out of the source.  */
  value <<= 31 - s;
  value >>= lsr;

  /* Shifting an all-ones word the same way yields a mask covering
     exactly the destination bits that the field replaces.  */
  mask <<= 31 - s;
  mask >>= lsr;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, (aarch64_get_reg_u32 (cpu, rd, NO_SP) & ~mask) | value);
}

/* 64 bit bitfield move, non-affected bits left as is.
   If r <= s Xd<s-r:0> = Xn<s:r> else Xd<64+s-r,64-r> = Xn<s:0>.  */
static void
bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd;
  uint64_t field = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t keep = 0xffffffffffffffffULL;
  /* After an LSL by 63 - s has moved source bit s up to bit 63, an
     LSR positions the field:
     r <= s : by 63 + r - s, so that Xn<s:r> lands at the bottom.
     r > s  : by r - (s + 1), so that Xn<s:0> starts at bit 63-(r-1).  */
  uint32_t lsl = 63 - s;
  uint32_t lsr = (r <= s) ? 63 + r - s : r - (s + 1);

  field <<= lsl;
  field >>= lsr;

  /* The mask must include the same bits.  */
  keep <<= lsl;
  keep >>= lsr;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  rd = INSTR (4, 0);
  aarch64_set_reg_u64
    (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~keep) | field);
}

static void
dexBitfieldImmediate (sim_cpu *cpu)
{
  /* Decode of the bitfield (immediate) class.
     assert instr[28:23] = 100110
     instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
     instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
     instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
     instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
     instr[9,5] = Rn
     instr[4,0] = Rd  */

  uint32_t dispatch;
  uint32_t imms;
  uint32_t size = INSTR (31, 31);
  uint32_t N = INSTR (22, 22);
  uint32_t immr = INSTR (21, 16);

  /* N must match the operation size: 0 for 32 bit and 1 for 64 bit.
     Any other combination is an UNALLOC.  */
  if (size != N)
    HALT_UNALLOC;

  /* 32 bit operations must have immr[5] = 0 and imms[5] = 0
     or else we have an UNALLOC.  */
  if (!size && uimm (immr, 5, 5))
    HALT_UNALLOC;

  imms = INSTR (15, 10);
  if (!size && uimm (imms, 5, 5))
    HALT_UNALLOC;

  /* Switch on combined size and op.  */
  dispatch = INSTR (31, 29);
  switch (dispatch)
    {
    case 0: sbfm32 (cpu, immr, imms); return;
    case 1: bfm32 (cpu, immr, imms); return;
    case 2: ubfm32 (cpu, immr, imms); return;
    case 4: sbfm (cpu, immr, imms); return;
    case 5: bfm (cpu, immr, imms); return;
    case 6: ubfm (cpu, immr, imms); return;
    default: HALT_UNALLOC;
    }
}

/* 32 bit EXTR: Wd = bits <imms+31:imms> of the 64 bit value Wn:Wm,
   written zero-extended to the 64 bit register.  */
static void
do_EXTR_32 (sim_cpu *cpu)
{
  /* instr[31:21] = 00010011100
     instr[20,16] = Rm
     instr[15,10] = imms :  0xxxxx for 32 bit
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */
  unsigned rm   = INSTR (20, 16);
  unsigned imms = INSTR (15, 10) & 31;
  unsigned rn   = INSTR ( 9,  5);
  unsigned rd   = INSTR ( 4,  0);
  uint64_t val1;
  uint64_t val2;

  /* Low part of the result: Wm shifted down by imms.  */
  val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  val1 >>= imms;

  /* High part of the result: Wn shifted up by 32 - imms.  This is done
     in 64 bits so that imms == 0 (a shift by 32) is well defined.  */
  val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  val2 <<= (32 - imms);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Only the low 32 bits form the result; the 64 bit shift above also
     leaves bits of Wn above bit 31 which must not reach the register.  */
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) (val1 | val2));
}

/* 64 bit EXTR: Xd = bits <imms+63:imms> of the 128 bit value Xn:Xm.  */
static void
do_EXTR_64 (sim_cpu *cpu)
{
  /* instr[31:21] = 10010011100
     instr[20,16] = Rm
     instr[15,10] = imms
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */
  unsigned rm   = INSTR (20, 16);
  unsigned imms = INSTR (15, 10) & 63;
  unsigned rn   = INSTR ( 9,  5);
  unsigned rd   = INSTR ( 4,  0);
  uint64_t val;

  val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  val >>= imms;

  /* With imms == 0 the result is just Xm.  Xn contributes nothing, and
     shifting it by 64 would be undefined behaviour in C.  */
  if (imms != 0)
    val |= (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms));

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
}

static void
dexExtractImmediate (sim_cpu *cpu)
{
  /* Decode of the extract (immediate) class.
     assert instr[28:23] = 100111
     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
     instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
     instr[21]    = op0 : must be 0 or UNALLOC
     instr[20,16] = Rm
     instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */

  uint32_t dispatch;
  uint32_t size = INSTR (31, 31);
  uint32_t N = INSTR (22, 22);
  uint32_t op0 = INSTR (21, 21);
  uint32_t imms = INSTR (15, 10);

  /* 32 bit operations must have N = 0 and 64 bit operations must
     have N = 1 or else we have an UNALLOC.  */
  if (size ^ N)
    HALT_UNALLOC;

  /* op0 must be 0 or else we have an UNALLOC.  */
  if (op0)
    HALT_UNALLOC;

  /* 32 bit operations must have imms[5] = 0
     or else we have an UNALLOC.  */
  if (!size && uimm (imms, 5, 5))
    HALT_UNALLOC;

  /* Switch on combined size and op.  */
  dispatch = INSTR (31, 29);

  if (dispatch == 0)
    do_EXTR_32 (cpu);

  else if (dispatch == 4)
    do_EXTR_64 (cpu);

  else if (dispatch == 1)
    HALT_NYI;
  else
    HALT_UNALLOC;
}

static void
dexDPImm (sim_cpu *cpu)
{
  /* Secondary dispatch for the data processing (immediate) group.
     uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
     assert  group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
     bits [25,23] of a DPImm are the secondary dispatch vector.  */
  uint32_t sel = dispatchDPImm (aarch64_get_instr (cpu));

  switch (sel)
    {
    case DPIMM_PCADR_000:
    case DPIMM_PCADR_001:
      dexPCRelAddressing (cpu);
      break;

    case DPIMM_ADDSUB_010:
    case DPIMM_ADDSUB_011:
      dexAddSubtractImmediate (cpu);
      break;

    case DPIMM_LOG_100:
      dexLogicalImmediate (cpu);
      break;

    case DPIMM_MOV_101:
      dexMoveWideImmediate (cpu);
      break;

    case DPIMM_BITF_110:
      dexBitfieldImmediate (cpu);
      break;

    case DPIMM_EXTR_111:
      dexExtractImmediate (cpu);
      break;

    default:
      /* Should never reach here.  */
      HALT_NYI;
    }
}

static void
dexLoadUnscaledImmediate (sim_cpu *cpu)
{
  /* Decode of load/store register (unscaled immediate).
     instr[29,24] == 111_00
     instr[21] == 0
     instr[11,10] == 00
     instr[31,30] = size
     instr[26] = V
     instr[23,22] = opc
     instr[20,12] = simm9
     instr[9,5] = rn may be SP.  */
  /* unsigned rt = INSTR (4, 0);  */
  uint32_t is_vec = INSTR (26, 26);
  /* Selector is size:opc.  */
  uint32_t sel = ((INSTR (31, 30) << 2) | INSTR (23, 22));
  int32_t simm9 = simm32 (aarch64_get_instr (cpu), 20, 12);

  if (!is_vec)
    {
      /* GReg operations.  */
      switch (sel)
	{
	case 0:	 sturb (cpu, simm9); return;
	case 1:	 ldurb32 (cpu, simm9); return;
	case 2:	 ldursb64 (cpu, simm9); return;
	case 3:	 ldursb32 (cpu, simm9); return;
	case 4:	 sturh (cpu, simm9); return;
	case 5:	 ldurh32 (cpu, simm9); return;
	case 6:	 ldursh64 (cpu, simm9); return;
	case 7:	 ldursh32 (cpu, simm9); return;
	case 8:	 stur32 (cpu, simm9); return;
	case 9:	 ldur32 (cpu, simm9); return;
	case 10: ldursw (cpu, simm9); return;
	case 12: stur64 (cpu, simm9); return;
	case 13: ldur64 (cpu, simm9); return;

	case 14:
	  /* PRFUM NYI.  */
	  HALT_NYI;

	default:
	case 11:
	case 15:
	  HALT_UNALLOC;
	}
    }

  /* FReg operations.  */
  switch (sel)
    {
    case 2:  fsturq (cpu, simm9); break;
    case 3:  fldurq (cpu, simm9); break;
    case 8:  fsturs (cpu, simm9); break;
    case 9:  fldurs (cpu, simm9); break;
    case 12: fsturd (cpu, simm9); break;
    case 13: fldurd (cpu, simm9); break;

    case 0: /* STUR 8 bit FP.  */
    case 1: /* LDUR 8 bit FP.  */
    case 4: /* STUR 16 bit FP.  */
    case 5: /* LDUR 16 bit FP.  */
      HALT_NYI;
      break;

    default:
    case 6:
    case 7:
    case 10:
    case 11:
    case 14:
    case 15:
      HALT_UNALLOC;
    }
}

/* N.B. A preliminary note regarding all the ldrs<x>32
   instructions.

   The signed value loaded by these instructions is cast to unsigned
   before being written with aarch64_set_reg_u64, i.e. to the 64 bit
   element of the GReg union.  This performs a 32 bit sign extension
   (as required) but avoids a 64 bit sign extension, thus ensuring that
   the top half of the register word is zero.  This is what the spec
   demands when a 32 bit load occurs.  */

/* 32 bit load sign-extended byte scaled unsigned 12 bit.
   The byte is sign-extended to 32 bits and the upper half of the
   64 bit register is cleared.  */
static void
ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned int rn = INSTR (9, 5);
  unsigned int rt = INSTR (4, 0);

  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte load.  */
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;

  /* Going through uint32_t keeps the 32 bit sign extension but zero
     extends the result to 64 bits; a cast to int64_t would instead
     set the top half of the register for negative bytes.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       (uint32_t) (int32_t) aarch64_get_mem_s8 (cpu, address));
}

/* 32 bit load sign-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.
   The byte is sign-extended to 32 bits and the upper half of the
   64 bit register is cleared.  SCALING is not used: there is no
   scaling required for a byte load.  */
static void
ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned int rm = INSTR (20, 16);
  unsigned int rn = INSTR (9, 5);
  unsigned int rt = INSTR (4, 0);

  /* rn may reference SP; rm and rt are accessed with NO_SP.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
				 extension);

  /* Going through uint32_t keeps the 32 bit sign extension but zero
     extends the result to 64 bits; a cast to int64_t would instead
     set the top half of the register for negative bytes.  */
  aarch64_set_reg_u64
    (cpu, rt, NO_SP,
     (uint32_t) (int32_t) aarch64_get_mem_s8 (cpu, address + displacement));
}

/* 32 bit load sign-extended byte unscaled signed 9 bit with
   pre- or post-writeback.
   The byte is sign-extended to 32 bits and the upper half of the
   64 bit register is cleared.  */
static void
ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  uint64_t address;
  unsigned int rn = INSTR (9, 5);
  unsigned int rt = INSTR (4, 0);

  /* Writing back to a base that is also the target is rejected.
     Register number 31 is exempt: as the base it names SP, as the
     target it names ZR, so the two do not overlap.  */
  if (rn == rt && rn != 31 && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb == Pre)
      address += offset;

  /* Going through uint32_t keeps the 32 bit sign extension but zero
     extends the result to 64 bits.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
		       (uint32_t) (int32_t) aarch64_get_mem_s8 (cpu, address));

  if (wb == Post)
    address += offset;

  /* The base was read with SP_OK, so the updated address has to be
     written back with SP_OK as well.  */
  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 8 bit FP/SIMD store, unsigned immediate offset.  Byte element 0 of
   the vector register is stored at base + OFFSET; the offset is used
   as is.  */
static void
fstrb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned src = INSTR (4, 0);
  unsigned base = INSTR (9, 5);
  uint64_t target = aarch64_get_reg_u64 (cpu, base, SP_OK) + offset;

  aarch64_set_mem_u8 (cpu, target, aarch64_get_vec_u8 (cpu, src, 0));
}

/* 8 bit store scaled or unscaled zero- or
   sign-extended 32-bit register offset.
   SCALING is not used: a byte access has a scale factor of one, so
   the extended register offset is applied unchanged in both cases.  */
static void
fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);

  uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
			       extension);
  /* The register offset must be applied whether or not scaling was
     requested; dropping it for the unscaled form would store to the
     bare base address.  */
  uint64_t  displacement = extended;

  aarch64_set_mem_u8
    (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
}

/* 16 bit FP/SIMD store, scaled unsigned immediate offset.  Halfword
   element 0 of the vector register is stored at
   base + SCALE (offset, 16).  */
static void
fstrh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned src = INSTR (4, 0);
  unsigned base = INSTR (9, 5);
  uint64_t target
    = aarch64_get_reg_u64 (cpu, base, SP_OK) + SCALE (offset, 16);

  aarch64_set_mem_u16 (cpu, target, aarch64_get_vec_u16 (cpu, src, 0));
}

/* 16 bit FP/SIMD store using a 32 bit register offset that is zero-
   or sign-extended and optionally scaled.  */
static void
fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned idx = INSTR (20, 16);
  unsigned base = INSTR (9, 5);
  unsigned src = INSTR (4, 0);
  uint64_t target = aarch64_get_reg_u64 (cpu, base, SP_OK);
  int64_t ext = extend (aarch64_get_reg_u32 (cpu, idx, NO_SP), extension);

  /* Add the (optionally scaled) extended offset to the base.  */
  target += (uint64_t) OPT_SCALE (ext, 16, scaling);

  aarch64_set_mem_u16 (cpu, target, aarch64_get_vec_u16 (cpu, src, 0));
}

/* 32 bit FP/SIMD store, scaled unsigned 12 bit offset.  Word element 0
   of the vector register is stored at base + SCALE (offset, 32).  */
static void
fstrs_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned src = INSTR (4, 0);
  unsigned base = INSTR (9, 5);
  uint64_t target
    = aarch64_get_reg_u64 (cpu, base, SP_OK) + SCALE (offset, 32);

  aarch64_set_mem_u32 (cpu, target, aarch64_get_vec_u32 (cpu, src, 0));
}

/* 32 bit FP/SIMD store, unscaled signed 9 bit offset, with pre- or
   post-writeback of the base register.  */
static void
fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base = INSTR (9, 5);
  unsigned src = INSTR (4, 0);
  uint64_t start = aarch64_get_reg_u64 (cpu, base, SP_OK);
  /* Everything except post-indexing accesses base + offset.  */
  uint64_t target = (wb == Post) ? start : start + offset;

  aarch64_set_mem_u32 (cpu, target, aarch64_get_vec_u32 (cpu, src, 0));

  if (wb == NoWriteBack)
    return;

  /* Pre- and post-indexing both leave base + offset in the base.  */
  aarch64_set_reg_u64 (cpu, base, SP_OK, start + offset);
}

/* 32 bit FP/SIMD store using a 32 bit register offset that is zero-
   or sign-extended and optionally scaled.  */
static void
fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned idx = INSTR (20, 16);
  unsigned base = INSTR (9, 5);
  unsigned src = INSTR (4, 0);
  uint64_t target = aarch64_get_reg_u64 (cpu, base, SP_OK);
  int64_t ext = extend (aarch64_get_reg_u32 (cpu, idx, NO_SP), extension);

  /* Add the (optionally scaled) extended offset to the base.  */
  target += (uint64_t) OPT_SCALE (ext, 32, scaling);

  aarch64_set_mem_u32 (cpu, target, aarch64_get_vec_u32 (cpu, src, 0));
}

/* 64 bit FP/SIMD store, scaled unsigned 12 bit offset.  Doubleword
   element 0 of the vector register is stored at
   base + SCALE (offset, 64).  */
static void
fstrd_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned src = INSTR (4, 0);
  unsigned base = INSTR (9, 5);
  uint64_t target
    = aarch64_get_reg_u64 (cpu, base, SP_OK) + SCALE (offset, 64);

  aarch64_set_mem_u64 (cpu, target, aarch64_get_vec_u64 (cpu, src, 0));
}

/* 64 bit FP/SIMD store, unscaled signed 9 bit offset, with pre- or
   post-writeback of the base register.  */
static void
fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base = INSTR (9, 5);
  unsigned src = INSTR (4, 0);
  uint64_t start = aarch64_get_reg_u64 (cpu, base, SP_OK);
  /* Everything except post-indexing accesses base + offset.  */
  uint64_t target = (wb == Post) ? start : start + offset;

  aarch64_set_mem_u64 (cpu, target, aarch64_get_vec_u64 (cpu, src, 0));

  if (wb == NoWriteBack)
    return;

  /* Pre- and post-indexing both leave base + offset in the base.  */
  aarch64_set_reg_u64 (cpu, base, SP_OK, start + offset);
}

/* 64 bit FP/SIMD store using a 32 bit register offset that is zero-
   or sign-extended and optionally scaled.  */
static void
fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned idx = INSTR (20, 16);
  unsigned base = INSTR (9, 5);
  unsigned src = INSTR (4, 0);
  uint64_t target = aarch64_get_reg_u64 (cpu, base, SP_OK);
  int64_t ext = extend (aarch64_get_reg_u32 (cpu, idx, NO_SP), extension);

  /* Add the (optionally scaled) extended offset to the base.  */
  target += (uint64_t) OPT_SCALE (ext, 64, scaling);

  aarch64_set_mem_u64 (cpu, target, aarch64_get_vec_u64 (cpu, src, 0));
}

/* 128 bit FP/SIMD store, scaled unsigned 12 bit offset.  The register
   is fetched as a long double and stored at
   base + SCALE (offset, 128).  */
static void
fstrq_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned src = INSTR (4, 0);
  unsigned base = INSTR (9, 5);
  FRegister q;
  uint64_t target;

  aarch64_get_FP_long_double (cpu, src, &q);

  target = aarch64_get_reg_u64 (cpu, base, SP_OK) + SCALE (offset, 128);
  aarch64_set_mem_long_double (cpu, target, q);
}

/* 128 bit FP/SIMD store, unscaled signed 9 bit offset, with pre- or
   post-writeback of the base register.  */
static void
fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned base = INSTR (9, 5);
  unsigned src = INSTR (4, 0);
  FRegister q;
  uint64_t start = aarch64_get_reg_u64 (cpu, base, SP_OK);
  /* Everything except post-indexing accesses base + offset.  */
  uint64_t target = (wb == Post) ? start : start + offset;

  aarch64_get_FP_long_double (cpu, src, &q);
  aarch64_set_mem_long_double (cpu, target, q);

  if (wb == NoWriteBack)
    return;

  /* Pre- and post-indexing both leave base + offset in the base.  */
  aarch64_set_reg_u64 (cpu, base, SP_OK, start + offset);
}

/* 128 bit FP/SIMD store using a 32 bit register offset that is zero-
   or sign-extended and optionally scaled.  */
static void
fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned idx = INSTR (20, 16);
  unsigned base = INSTR (9, 5);
  unsigned src = INSTR (4, 0);
  FRegister q;
  uint64_t target = aarch64_get_reg_u64 (cpu, base, SP_OK);
  int64_t ext = extend (aarch64_get_reg_u32 (cpu, idx, NO_SP), extension);

  /* Add the (optionally scaled) extended offset to the base.  */
  target += (uint64_t) OPT_SCALE (ext, 128, scaling);

  aarch64_get_FP_long_double (cpu, src, &q);
  aarch64_set_mem_long_double (cpu, target, q);
}

static void
dexLoadImmediatePrePost (sim_cpu *cpu)
{
  /* Decode of load/store register (immediate pre/post-indexed).
     instr[31,30] = size
     instr[29,27] = 111
     instr[26]    = V
     instr[25,24] = 00
     instr[23,22] = opc
     instr[21]    = 0
     instr[20,12] = simm9
     instr[11]    = wb : 0 ==> Post, 1 ==> Pre
     instr[10]    = 0
     instr[9,5]   = Rn may be SP.
     instr[4,0]   = Rt */

  uint32_t  is_vec = INSTR (26, 26);
  /* Selector is size:opc.  */
  uint32_t  sel    = ((INSTR (31, 30) << 2) | INSTR (23, 22));
  int32_t   simm9  = simm32 (aarch64_get_instr (cpu), 20, 12);
  WriteBack mode   = INSTR (11, 11);

  if (!is_vec)
    {
      /* GReg operations.  */
      switch (sel)
	{
	case 0:	 strb_wb (cpu, simm9, mode); return;
	case 1:	 ldrb32_wb (cpu, simm9, mode); return;
	case 2:	 ldrsb_wb (cpu, simm9, mode); return;
	case 3:	 ldrsb32_wb (cpu, simm9, mode); return;
	case 4:	 strh_wb (cpu, simm9, mode); return;
	case 5:	 ldrh32_wb (cpu, simm9, mode); return;
	case 6:	 ldrsh64_wb (cpu, simm9, mode); return;
	case 7:	 ldrsh32_wb (cpu, simm9, mode); return;
	case 8:	 str32_wb (cpu, simm9, mode); return;
	case 9:	 ldr32_wb (cpu, simm9, mode); return;
	case 10: ldrsw_wb (cpu, simm9, mode); return;
	case 12: str_wb (cpu, simm9, mode); return;
	case 13: ldr_wb (cpu, simm9, mode); return;

	default:
	case 11:
	case 14:
	case 15:
	  HALT_UNALLOC;
	}
    }

  /* FReg operations.  */
  switch (sel)
    {
    case 2:  fstrq_wb (cpu, simm9, mode); break;
    case 3:  fldrq_wb (cpu, simm9, mode); break;
    case 8:  fstrs_wb (cpu, simm9, mode); break;
    case 9:  fldrs_wb (cpu, simm9, mode); break;
    case 12: fstrd_wb (cpu, simm9, mode); break;
    case 13: fldrd_wb (cpu, simm9, mode); break;

    case 0:	  /* STR 8 bit FP.  */
    case 1:	  /* LDR 8 bit FP.  */
    case 4:	  /* STR 16 bit FP.  */
    case 5:	  /* LDR 16 bit FP.  */
      HALT_NYI;
      break;

    default:
    case 6:
    case 7:
    case 10:
    case 11:
    case 14:
    case 15:
      HALT_UNALLOC;
    }
}

static void
dexLoadRegisterOffset (sim_cpu *cpu)
{
  /* Decode of load/store register (register offset).
     instr[31,30] = size
     instr[29,27] = 111
     instr[26]    = V
     instr[25,24] = 00
     instr[23,22] = opc
     instr[21]    = 1
     instr[20,16] = rm
     instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
                             110 ==> SXTW, 111 ==> SXTX,
                             ow ==> RESERVED
     instr[12]    = scaled
     instr[11,10] = 10
     instr[9,5]   = rn
     instr[4,0]   = rt.  */

  uint32_t  is_vec = INSTR (26, 26);
  /* Selector is size:opc.  */
  uint32_t  sel = ((INSTR (31, 30) << 2) | INSTR (23, 22));
  Scaling   scaled = INSTR (12, 12);
  Extension ext = INSTR (15, 13);

  /* Every legal option value has bit 1 set.  */
  if (uimm (ext, 1, 1) == 0)
    HALT_UNALLOC;

  /* The two 64 bit options are handed on as "no extension".  */
  if (ext == UXTX || ext == SXTX)
    ext = NoExtension;

  if (!is_vec)
    {
      /* GReg operations.  */
      switch (sel)
	{
	case 0:	 strb_scale_ext (cpu, scaled, ext); return;
	case 1:	 ldrb32_scale_ext (cpu, scaled, ext); return;
	case 2:	 ldrsb_scale_ext (cpu, scaled, ext); return;
	case 3:	 ldrsb32_scale_ext (cpu, scaled, ext); return;
	case 4:	 strh_scale_ext (cpu, scaled, ext); return;
	case 5:	 ldrh32_scale_ext (cpu, scaled, ext); return;
	case 6:	 ldrsh_scale_ext (cpu, scaled, ext); return;
	case 7:	 ldrsh32_scale_ext (cpu, scaled, ext); return;
	case 8:	 str32_scale_ext (cpu, scaled, ext); return;
	case 9:	 ldr32_scale_ext (cpu, scaled, ext); return;
	case 10: ldrsw_scale_ext (cpu, scaled, ext); return;
	case 12: str_scale_ext (cpu, scaled, ext); return;
	case 13: ldr_scale_ext (cpu, scaled, ext); return;
	case 14: prfm_scale_ext (cpu, scaled, ext); return;

	default:
	case 11:
	case 15:
	  HALT_UNALLOC;
	}
    }

  /* FReg operations.  */
  switch (sel)
    {
    /* Stores.  */
    case 0:  fstrb_scale_ext (cpu, scaled, ext); break;
    case 2:  fstrq_scale_ext (cpu, scaled, ext); break;
    case 4:  fstrh_scale_ext (cpu, scaled, ext); break;
    case 8:  fstrs_scale_ext (cpu, scaled, ext); break;
    case 12: fstrd_scale_ext (cpu, scaled, ext); break;

    /* Loads.  */
    case 3:  fldrq_scale_ext (cpu, scaled, ext); break;
    case 9:  fldrs_scale_ext (cpu, scaled, ext); break;
    case 13: fldrd_scale_ext (cpu, scaled, ext); break;

    case 1: /* LDR 8 bit FP.  */
    case 5: /* LDR 16 bit FP.  */
      HALT_NYI;
      break;

    default:
    case 6:
    case 7:
    case 10:
    case 11:
    case 14:
    case 15:
      HALT_UNALLOC;
    }
}

static void
dexLoadUnsignedImmediate (sim_cpu *cpu)
{
  /* Decode of load/store register (unsigned immediate).
     instr[29,24] == 111_01
     instr[31,30] = size
     instr[26]    = V
     instr[23,22] = opc
     instr[21,10] = uimm12 : unsigned immediate offset
     instr[9,5]   = rn may be SP.
     instr[4,0]   = rt.  */

  uint32_t is_vec = INSTR (26,26);
  /* Selector is size:opc.  */
  uint32_t sel = ((INSTR (31, 30) << 2) | INSTR (23, 22));
  uint32_t uimm12 = INSTR (21, 10);

  if (!is_vec)
    {
      /* GReg operations.  */
      switch (sel)
	{
	case 0:  strb_abs (cpu, uimm12); return;
	case 1:  ldrb32_abs (cpu, uimm12); return;
	case 2:  ldrsb_abs (cpu, uimm12); return;
	case 3:  ldrsb32_abs (cpu, uimm12); return;
	case 4:  strh_abs (cpu, uimm12); return;
	case 5:  ldrh32_abs (cpu, uimm12); return;
	case 6:  ldrsh_abs (cpu, uimm12); return;
	case 7:  ldrsh32_abs (cpu, uimm12); return;
	case 8:  str32_abs (cpu, uimm12); return;
	case 9:  ldr32_abs (cpu, uimm12); return;
	case 10: ldrsw_abs (cpu, uimm12); return;
	case 12: str_abs (cpu, uimm12); return;
	case 13: ldr_abs (cpu, uimm12); return;
	case 14: prfm_abs (cpu, uimm12); return;

	default:
	case 11:
	case 15:
	  HALT_UNALLOC;
	}
    }

  /* FReg operations.  */
  switch (sel)
    {
    /* Stores.  */
    case 0:  fstrb_abs (cpu, uimm12); break;
    case 2:  fstrq_abs (cpu, uimm12); break;
    case 4:  fstrh_abs (cpu, uimm12); break;
    case 8:  fstrs_abs (cpu, uimm12); break;
    case 12: fstrd_abs (cpu, uimm12); break;

    /* Loads.  */
    case 1:  fldrb_abs (cpu, uimm12); break;
    case 3:  fldrq_abs (cpu, uimm12); break;
    case 5:  fldrh_abs (cpu, uimm12); break;
    case 9:  fldrs_abs (cpu, uimm12); break;
    case 13: fldrd_abs (cpu, uimm12); break;

    default:
    case 6:
    case 7:
    case 10:
    case 11:
    case 14:
    case 15:
      HALT_UNALLOC;
    }
}

static void
dexLoadExclusive (sim_cpu *cpu)
{
  /* Decode of the load/store exclusive class.
     assert instr[29:24] = 001000;
     instr[31,30] = size
     instr[23] = 0 if exclusive
     instr[22] = L : 1 if load, 0 if store
     instr[21] = 1 if pair
     instr[20,16] = Rs
     instr[15] = o0 : 1 if ordered
     instr[14,10] = Rt2
     instr[9,5] = Rn
     instr[4,0] = Rt.  */

  /* L:pair.  Only the non-pair forms are handled.  */
  uint32_t l_pair = INSTR (22, 21);

  if (l_pair == 2)
    ldxr (cpu);
  else if (l_pair == 0)
    stxr (cpu);
  else
    HALT_NYI;
}

static void
dexLoadOther (sim_cpu *cpu)
{
  uint32_t sel;

  /* instr[29,25] = 111_0
     instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
     instr[21:11,10] is the secondary dispatch.  */
  if (INSTR (24, 24))
    {
      dexLoadUnsignedImmediate (cpu);
      return;
    }

  /* Selector is instr[21]:instr[11,10].  */
  sel = ((INSTR (21, 21) << 2) | INSTR (11, 10));
  switch (sel)
    {
    case 0:
      dexLoadUnscaledImmediate (cpu);
      break;

    case 1:
    case 3:
      dexLoadImmediatePrePost (cpu);
      break;

    case 6:
      dexLoadRegisterOffset (cpu);
      break;

    default:
    case 2:
    case 4:
    case 5:
    case 7:
      HALT_NYI;
    }
}

/* Store the low 32 bits of the register pair instr[4,0] / instr[14,10]
   to two consecutive words at base register instr[9,5] plus OFFSET
   words, with optional pre- or post-writeback of the base.  */
static void
store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (14, 10);
  unsigned rd = INSTR (9, 5);
  unsigned rm = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);

  /* Writing back to a base that is also one of the stored registers
     is rejected.  Register number 31 is exempt: as the base it names
     SP, as a stored register it names ZR, so they do not overlap.  */
  if (wb != NoWriteBack && rd != 31 && (rn == rd || rm == rd))
    HALT_UNALLOC;

  /* Scale to bytes.  Multiply rather than shift because OFFSET may be
     negative and left shifting a negative value is undefined.  */
  offset *= 4;

  if (wb != Post)
    address += offset;

  aarch64_set_mem_u32 (cpu, address,
		       aarch64_get_reg_u32 (cpu, rm, NO_SP));
  aarch64_set_mem_u32 (cpu, address + 4,
		       aarch64_get_reg_u32 (cpu, rn, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
}

/* Store the register pair instr[4,0] / instr[14,10] to two consecutive
   doublewords at base register instr[9,5] plus OFFSET doublewords,
   with optional pre- or post-writeback of the base.  */
static void
store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (14, 10);
  unsigned rd = INSTR (9, 5);
  unsigned rm = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);

  /* Writing back to a base that is also one of the stored registers
     is rejected.  Register number 31 is exempt: as the base it names
     SP, as a stored register it names ZR, so they do not overlap.  */
  if (wb != NoWriteBack && rd != 31 && (rn == rd || rm == rd))
    HALT_UNALLOC;

  /* Scale to bytes.  Multiply rather than shift because OFFSET may be
     negative and left shifting a negative value is undefined.  */
  offset *= 8;

  if (wb != Post)
    address += offset;

  aarch64_set_mem_u64 (cpu, address,
		       aarch64_get_reg_u64 (cpu, rm, NO_SP));
  aarch64_set_mem_u64 (cpu, address + 8,
		       aarch64_get_reg_u64 (cpu, rn, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
}

/* Load two consecutive words from base register instr[9,5] plus OFFSET
   words into the register pair instr[4,0] / instr[14,10], each zero
   extended to 64 bits, with optional pre- or post-writeback of the
   base.  */
static void
load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (14, 10);
  unsigned rd = INSTR (9, 5);
  unsigned rm = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);

  /* Treat this as unalloc to make sure we don't do it.  */
  if (rn == rm)
    HALT_UNALLOC;

  /* Scale to bytes.  Multiply rather than shift because OFFSET may be
     negative and left shifting a negative value is undefined.  */
  offset *= 4;

  if (wb != Post)
    address += offset;

  /* Only the base register may be SP; number 31 as a transfer register
     is ZR, so the destinations are written with NO_SP, matching how
     the pair stores read their sources.  */
  aarch64_set_reg_u64 (cpu, rm, NO_SP, aarch64_get_mem_u32 (cpu, address));
  aarch64_set_reg_u64 (cpu, rn, NO_SP, aarch64_get_mem_u32 (cpu, address + 4));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
}

/* Load two consecutive words from base register instr[9,5] plus OFFSET
   words into the register pair instr[4,0] / instr[14,10], each sign
   extended to 64 bits, with optional pre- or post-writeback of the
   base.  */
static void
load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (14, 10);
  unsigned rd = INSTR (9, 5);
  unsigned rm = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);

  /* Treat this as unalloc to make sure we don't do it.  */
  if (rn == rm)
    HALT_UNALLOC;

  /* Scale to bytes.  Multiply rather than shift because OFFSET may be
     negative and left shifting a negative value is undefined.  */
  offset *= 4;

  if (wb != Post)
    address += offset;

  /* Only the base register may be SP; number 31 as a transfer register
     is ZR, so the destinations are written with NO_SP, matching how
     the pair stores read their sources.  */
  aarch64_set_reg_s64 (cpu, rm, NO_SP, aarch64_get_mem_s32 (cpu, address));
  aarch64_set_reg_s64 (cpu, rn, NO_SP, aarch64_get_mem_s32 (cpu, address + 4));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
}

/* Load two consecutive doublewords from base register instr[9,5] plus
   OFFSET doublewords into the register pair instr[4,0] / instr[14,10],
   with optional pre- or post-writeback of the base.  */
static void
load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (14, 10);
  unsigned rd = INSTR (9, 5);
  unsigned rm = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);

  /* Treat this as unalloc to make sure we don't do it.  */
  if (rn == rm)
    HALT_UNALLOC;

  /* Scale to bytes.  Multiply rather than shift because OFFSET may be
     negative and left shifting a negative value is undefined.  */
  offset *= 8;

  if (wb != Post)
    address += offset;

  /* Only the base register may be SP; number 31 as a transfer register
     is ZR, so the destinations are written with NO_SP, matching how
     the pair stores read their sources.  */
  aarch64_set_reg_u64 (cpu, rm, NO_SP, aarch64_get_mem_u64 (cpu, address));
  aarch64_set_reg_u64 (cpu, rn, NO_SP, aarch64_get_mem_u64 (cpu, address + 8));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
}

/* Decode and execute a load/store pair of general registers.

     instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
     instr[29,25] = instruction encoding: 101_0
     instr[26]    = V : 1 if fp 0 if gp
     instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
     instr[22]    = load/store (1=> load)
     instr[21,15] = signed, scaled, offset
     instr[14,10] = Rn
     instr[ 9, 5] = Rd
     instr[ 4, 0] = Rm.

   Any size/mode/direction combination without a handler below is
   reported as unallocated.  */
static void
dex_load_store_pair_gr (sim_cpu *cpu)
{
  const uint32_t size = INSTR (31, 30);
  const uint32_t mode = INSTR (24, 23);
  const uint32_t is_load = INSTR (22, 22);
  const int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
  WriteBack wb;

  /* Translate the addressing mode field first; mode 00 has no
     handler.  */
  switch (mode)
    {
    case 1: wb = Post; break;
    case 2: wb = NoWriteBack; break;
    case 3: wb = Pre; break;
    default:
      HALT_UNALLOC;
      return;
    }

  switch (size)
    {
    case 0:
      if (is_load)
        load_pair_u32 (cpu, offset, wb);
      else
        store_pair_u32 (cpu, offset, wb);
      return;

    case 1:
      /* The signed 32-bit size only has a load form.  */
      if (is_load)
        load_pair_s32 (cpu, offset, wb);
      else
        HALT_UNALLOC;
      return;

    case 2:
      if (is_load)
        load_pair_u64 (cpu, offset, wb);
      else
        store_pair_u64 (cpu, offset, wb);
      return;

    default:
      HALT_UNALLOC;
    }
}

/* Store the 32-bit element 0 of two vector registers to consecutive
   words in memory.

   instr[4,0] supplies the word stored at the lower address,
   instr[14,10] the word stored 4 bytes above it and instr[9,5] the
   base register (which may be SP).  OFFSET is in units of 4 bytes;
   WB selects pre-index, post-index or no base write-back.  */
static void
store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  const unsigned second = INSTR (14, 10);
  const unsigned base = INSTR (9, 5);
  const unsigned first = INSTR (4, 0);
  uint64_t ea = aarch64_get_reg_u64 (cpu, base, SP_OK);
  uint32_t word;

  offset <<= 2;

  /* Everything except post-index accesses base + offset.  */
  if (wb != Post)
    ea += offset;

  word = aarch64_get_vec_u32 (cpu, first, 0);
  aarch64_set_mem_u32 (cpu, ea, word);
  word = aarch64_get_vec_u32 (cpu, second, 0);
  aarch64_set_mem_u32 (cpu, ea + 4, word);

  if (wb == NoWriteBack)
    return;

  /* Post-index applies the offset only after the access.  */
  if (wb == Post)
    ea += offset;

  aarch64_set_reg_u64 (cpu, base, SP_OK, ea);
}

/* Store the 64-bit element 0 of two vector registers to consecutive
   doublewords in memory.

   instr[4,0] supplies the value stored at the lower address,
   instr[14,10] the value stored 8 bytes above it and instr[9,5] the
   base register (which may be SP).  OFFSET is in units of 8 bytes;
   WB selects pre-index, post-index or no base write-back.  */
static void
store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  const unsigned second = INSTR (14, 10);
  const unsigned base = INSTR (9, 5);
  const unsigned first = INSTR (4, 0);
  uint64_t ea = aarch64_get_reg_u64 (cpu, base, SP_OK);
  uint64_t dword;

  offset <<= 3;

  /* Everything except post-index accesses base + offset.  */
  if (wb != Post)
    ea += offset;

  dword = aarch64_get_vec_u64 (cpu, first, 0);
  aarch64_set_mem_u64 (cpu, ea, dword);
  dword = aarch64_get_vec_u64 (cpu, second, 0);
  aarch64_set_mem_u64 (cpu, ea + 8, dword);

  if (wb == NoWriteBack)
    return;

  /* Post-index applies the offset only after the access.  */
  if (wb == Post)
    ea += offset;

  aarch64_set_reg_u64 (cpu, base, SP_OK, ea);
}

/* Store two FP registers, as long double values, to memory 16 bytes
   apart.

   instr[4,0] supplies the value stored at the lower address,
   instr[14,10] the value stored 16 bytes above it and instr[9,5] the
   base register (which may be SP).  OFFSET is in units of 16 bytes;
   WB selects pre-index, post-index or no base write-back.  */
static void
store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  const unsigned second = INSTR (14, 10);
  const unsigned base = INSTR (9, 5);
  const unsigned first = INSTR (4, 0);
  uint64_t ea = aarch64_get_reg_u64 (cpu, base, SP_OK);
  FRegister value;

  offset <<= 4;

  /* Everything except post-index accesses base + offset.  */
  if (wb != Post)
    ea += offset;

  aarch64_get_FP_long_double (cpu, first, & value);
  aarch64_set_mem_long_double (cpu, ea, value);
  aarch64_get_FP_long_double (cpu, second, & value);
  aarch64_set_mem_long_double (cpu, ea + 16, value);

  if (wb == NoWriteBack)
    return;

  /* Post-index applies the offset only after the access.  */
  if (wb == Post)
    ea += offset;

  aarch64_set_reg_u64 (cpu, base, SP_OK, ea);
}

/* Load two consecutive 32-bit words into element 0 of two vector
   registers.

   instr[4,0] receives the word at the lower address, instr[14,10] the
   word 4 bytes above it and instr[9,5] is the base register (which
   may be SP).  Naming the same destination twice is reported as
   unallocated.  OFFSET is in units of 4 bytes; WB selects pre-index,
   post-index or no base write-back.  */
static void
load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  const unsigned second = INSTR (14, 10);
  const unsigned base = INSTR (9, 5);
  const unsigned first = INSTR (4, 0);
  uint64_t ea = aarch64_get_reg_u64 (cpu, base, SP_OK);
  uint32_t word;

  if (first == second)
    HALT_UNALLOC;

  offset <<= 2;

  /* Everything except post-index accesses base + offset.  */
  if (wb != Post)
    ea += offset;

  word = aarch64_get_mem_u32 (cpu, ea);
  aarch64_set_vec_u32 (cpu, first, 0, word);
  word = aarch64_get_mem_u32 (cpu, ea + 4);
  aarch64_set_vec_u32 (cpu, second, 0, word);

  if (wb == NoWriteBack)
    return;

  /* Post-index applies the offset only after the access.  */
  if (wb == Post)
    ea += offset;

  aarch64_set_reg_u64 (cpu, base, SP_OK, ea);
}

/* Load two consecutive 64-bit doublewords into element 0 of two
   vector registers.

   instr[4,0] receives the value at the lower address, instr[14,10]
   the value 8 bytes above it and instr[9,5] is the base register
   (which may be SP).  Naming the same destination twice is reported
   as unallocated.  OFFSET is in units of 8 bytes; WB selects
   pre-index, post-index or no base write-back.  */
static void
load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  const unsigned second = INSTR (14, 10);
  const unsigned base = INSTR (9, 5);
  const unsigned first = INSTR (4, 0);
  uint64_t ea = aarch64_get_reg_u64 (cpu, base, SP_OK);
  uint64_t dword;

  if (first == second)
    HALT_UNALLOC;

  offset <<= 3;

  /* Everything except post-index accesses base + offset.  */
  if (wb != Post)
    ea += offset;

  dword = aarch64_get_mem_u64 (cpu, ea);
  aarch64_set_vec_u64 (cpu, first, 0, dword);
  dword = aarch64_get_mem_u64 (cpu, ea + 8);
  aarch64_set_vec_u64 (cpu, second, 0, dword);

  if (wb == NoWriteBack)
    return;

  /* Post-index applies the offset only after the access.  */
  if (wb == Post)
    ea += offset;

  aarch64_set_reg_u64 (cpu, base, SP_OK, ea);
}

/* Load two long double values, 16 bytes apart in memory, into two FP
   registers.

   instr[4,0] receives the value at the lower address, instr[14,10]
   the value 16 bytes above it and instr[9,5] is the base register
   (which may be SP).  Naming the same destination twice is reported
   as unallocated.  OFFSET is in units of 16 bytes; WB selects
   pre-index, post-index or no base write-back.  */
static void
load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  const unsigned second = INSTR (14, 10);
  const unsigned base = INSTR (9, 5);
  const unsigned first = INSTR (4, 0);
  uint64_t ea = aarch64_get_reg_u64 (cpu, base, SP_OK);
  FRegister value;

  if (first == second)
    HALT_UNALLOC;

  offset <<= 4;

  /* Everything except post-index accesses base + offset.  */
  if (wb != Post)
    ea += offset;

  aarch64_get_mem_long_double (cpu, ea, & value);
  aarch64_set_FP_long_double (cpu, first, value);
  aarch64_get_mem_long_double (cpu, ea + 16, & value);
  aarch64_set_FP_long_double (cpu, second, value);

  if (wb == NoWriteBack)
    return;

  /* Post-index applies the offset only after the access.  */
  if (wb == Post)
    ea += offset;

  aarch64_set_reg_u64 (cpu, base, SP_OK, ea);
}

/* Decode and execute a load/store pair of FP/vector registers.

     instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
     instr[29,25] = instruction encoding
     instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
     instr[22]    = load/store (1=> load)
     instr[21,15] = signed, scaled, offset
     instr[14,10] = Rn
     instr[ 9, 5] = Rd
     instr[ 4, 0] = Rm

   Any size/mode combination without a handler below is reported as
   unallocated.  */
static void
dex_load_store_pair_fp (sim_cpu *cpu)
{
  const uint32_t size = INSTR (31, 30);
  const uint32_t mode = INSTR (24, 23);
  const uint32_t is_load = INSTR (22, 22);
  const int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
  WriteBack wb;

  /* Translate the addressing mode field first; mode 00 has no
     handler.  */
  switch (mode)
    {
    case 1: wb = Post; break;
    case 2: wb = NoWriteBack; break;
    case 3: wb = Pre; break;
    default:
      HALT_UNALLOC;
      return;
    }

  switch (size)
    {
    case 0:
      if (is_load)
        load_pair_float (cpu, offset, wb);
      else
        store_pair_float (cpu, offset, wb);
      return;

    case 1:
      if (is_load)
        load_pair_double (cpu, offset, wb);
      else
        store_pair_double (cpu, offset, wb);
      return;

    case 2:
      if (is_load)
        load_pair_long_double (cpu, offset, wb);
      else
        store_pair_long_double (cpu, offset, wb);
      return;

    default:
      HALT_UNALLOC;
    }
}

/* Return the number of the vector register O places after register V.

   Vector register numbers come from 5-bit instruction fields, so
   there are 32 of them and a consecutive register list wraps from
   register 31 back to register 0.  */
static inline unsigned
vec_reg (unsigned v, unsigned o)
{
  /* Reduce modulo 32; a wider mask would yield register numbers that
     cannot be encoded.  */
  return (v + o) & 0x1F;
}

/* Load multiple N-element structures to M consecutive registers.

   Memory is read sequentially starting at ADDRESS.  When N == M the
   elements of each structure are de-interleaved across N registers;
   otherwise M registers are filled one after another.  instr[30]
   selects half or full registers, instr[11,10] the element size and
   instr[4,0] the first register.  */
static void
vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
  const unsigned full = INSTR (30, 30);
  const unsigned esize = INSTR (11, 10);
  const unsigned first = INSTR (4, 0);
  /* A half register holds 8 >> esize elements, a full one twice
     that.  */
  const unsigned lanes = (8u >> esize) * (1 + full);
  const unsigned groups = (N == M) ? 1 : M;
  unsigned group, lane, elem;

  for (group = 0; group < groups; group++)
    for (lane = 0; lane < lanes; lane++)
      for (elem = 0; elem < N; elem++)
        {
          const unsigned reg = vec_reg (first, group + elem);

          switch (esize)
            {
            case 0: /* 8-bit operations.  */
              aarch64_set_vec_u8 (cpu, reg, lane,
                                  aarch64_get_mem_u8 (cpu, address));
              break;

            case 1: /* 16-bit operations.  */
              aarch64_set_vec_u16 (cpu, reg, lane,
                                   aarch64_get_mem_u16 (cpu, address));
              break;

            case 2: /* 32-bit operations.  */
              aarch64_set_vec_u32 (cpu, reg, lane,
                                   aarch64_get_mem_u32 (cpu, address));
              break;

            case 3: /* 64-bit operations.  */
              aarch64_set_vec_u64 (cpu, reg, lane,
                                   aarch64_get_mem_u64 (cpu, address));
              break;
            }

          /* Step over the element just read.  */
          address += 1u << esize;
        }
}

/* Load multiple 4-element structures into four consecutive registers.
   ADDRESS is the address of the first element in memory.  */
static void
LD4 (sim_cpu *cpu, uint64_t address)
{
  vec_load (cpu, address, 4, 4);
}

/* Load multiple 3-element structures into three consecutive registers.
   ADDRESS is the address of the first element in memory.  */
static void
LD3 (sim_cpu *cpu, uint64_t address)
{
  vec_load (cpu, address, 3, 3);
}

/* Load multiple 2-element structures into two consecutive registers.
   ADDRESS is the address of the first element in memory.  */
static void
LD2 (sim_cpu *cpu, uint64_t address)
{
  vec_load (cpu, address, 2, 2);
}

/* Load multiple 1-element structures into one register.
   ADDRESS is the address of the first element in memory.  */
static void
LD1_1 (sim_cpu *cpu, uint64_t address)
{
  vec_load (cpu, address, 1, 1);
}

/* Load multiple 1-element structures into two registers, filling the
   first register completely before moving on to the second.  */
static void
LD1_2 (sim_cpu *cpu, uint64_t address)
{
  vec_load (cpu, address, 1, 2);
}

/* Load multiple 1-element structures into three registers, filling
   each register completely before moving on to the next.  */
static void
LD1_3 (sim_cpu *cpu, uint64_t address)
{
  vec_load (cpu, address, 1, 3);
}

/* Load multiple 1-element structures into four registers, filling
   each register completely before moving on to the next.  */
static void
LD1_4 (sim_cpu *cpu, uint64_t address)
{
  vec_load (cpu, address, 1, 4);
}

/* Store multiple N-element structures from M consecutive registers.

   Memory is written sequentially starting at ADDRESS.  When N == M
   the elements of N registers are interleaved into structures;
   otherwise M registers are written out one after another.
   instr[30] selects half or full registers, instr[11,10] the element
   size and instr[4,0] the first register.  */
static void
vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
  const unsigned full = INSTR (30, 30);
  const unsigned esize = INSTR (11, 10);
  const unsigned first = INSTR (4, 0);
  /* A half register holds 8 >> esize elements, a full one twice
     that.  */
  const unsigned lanes = (8u >> esize) * (1 + full);
  const unsigned groups = (N == M) ? 1 : M;
  unsigned group, lane, elem;

  for (group = 0; group < groups; group++)
    for (lane = 0; lane < lanes; lane++)
      for (elem = 0; elem < N; elem++)
        {
          const unsigned reg = vec_reg (first, group + elem);

          switch (esize)
            {
            case 0: /* 8-bit operations.  */
              aarch64_set_mem_u8 (cpu, address,
                                  aarch64_get_vec_u8 (cpu, reg, lane));
              break;

            case 1: /* 16-bit operations.  */
              aarch64_set_mem_u16 (cpu, address,
                                   aarch64_get_vec_u16 (cpu, reg, lane));
              break;

            case 2: /* 32-bit operations.  */
              aarch64_set_mem_u32 (cpu, address,
                                   aarch64_get_vec_u32 (cpu, reg, lane));
              break;

            case 3: /* 64-bit operations.  */
              aarch64_set_mem_u64 (cpu, address,
                                   aarch64_get_vec_u64 (cpu, reg, lane));
              break;
            }

          /* Step over the element just written.  */
          address += 1u << esize;
        }
}

/* Store multiple 4-element structures from four consecutive registers.
   ADDRESS is the address of the first element in memory.  */
static void
ST4 (sim_cpu *cpu, uint64_t address)
{
  vec_store (cpu, address, 4, 4);
}

/* Store multiple 3-element structures from three consecutive registers.
   ADDRESS is the address of the first element in memory.  */
static void
ST3 (sim_cpu *cpu, uint64_t address)
{
  vec_store (cpu, address, 3, 3);
}

/* Store multiple 2-element structures from two consecutive registers.
   ADDRESS is the address of the first element in memory.  */
static void
ST2 (sim_cpu *cpu, uint64_t address)
{
  vec_store (cpu, address, 2, 2);
}

/* Store multiple 1-element structures from one register.
   ADDRESS is the address of the first element in memory.  */
static void
ST1_1 (sim_cpu *cpu, uint64_t address)
{
  vec_store (cpu, address, 1, 1);
}

/* Store multiple 1-element structures from two registers, writing the
   first register out completely before moving on to the second.  */
static void
ST1_2 (sim_cpu *cpu, uint64_t address)
{
  vec_store (cpu, address, 1, 2);
}

/* Store multiple 1-element structures from three registers, writing
   each register out completely before moving on to the next.  */
static void
ST1_3 (sim_cpu *cpu, uint64_t address)
{
  vec_store (cpu, address, 1, 3);
}

/* Store multiple 1-element structures from four registers, writing
   each register out completely before moving on to the next.  */
static void
ST1_4 (sim_cpu *cpu, uint64_t address)
{
  vec_store (cpu, address, 1, 4);
}

/* Decode the lane number and element size of a single-structure
   load/store.

   This macro reads and writes locals of the calling function: it
   expects FULL (instr[30]), S (instr[12]) and SIZE (instr[11,10]) to
   be set, stores the lane index in LANE and then overwrites SIZE with
   the element size selector (0=> byte, 1=> half, 2=> word,
   3=> double).  instr[15,14] chooses the element width; the narrower
   the element, the more of FULL:S:SIZE is used as lane number.  Bit
   combinations that do not name a lane halt as unallocated.  */
#define LDn_STn_SINGLE_LANE_AND_SIZE()				\
  do								\
    {								\
      switch (INSTR (15, 14))					\
	{							\
	case 0:							\
	  lane = (full << 3) | (s << 2) | size;			\
	  size = 0;						\
	  break;						\
								\
	case 1:							\
	  if ((size & 1) == 1)					\
	    HALT_UNALLOC;					\
	  lane = (full << 2) | (s << 1) | (size >> 1);		\
	  size = 1;						\
	  break;						\
								\
	case 2:							\
	  if ((size & 2) == 2)					\
	    HALT_UNALLOC;					\
								\
	  if ((size & 1) == 0)					\
	    {							\
	      lane = (full << 1) | s;				\
	      size = 2;						\
	    }							\
	  else							\
	    {							\
	      if (s)						\
		HALT_UNALLOC;					\
	      lane = full;					\
	      size = 3;						\
	    }							\
	  break;						\
								\
	default:						\
	  HALT_UNALLOC;						\
	}							\
    }								\
  while (0)

/* Load single structure into one lane of N registers.

   ADDRESS is the address of the first element.  One element is read
   per register from consecutive memory locations and written to the
   same lane of N consecutive registers starting at Vd.  The register
   list wraps from register 31 back to register 0.  */
static void
do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
{
  /* instr[31]    = 0
     instr[30]    = element selector 0=>half, 1=>all elements
     instr[29,24] = 00 1101
     instr[23]    = 0=>simple, 1=>post
     instr[22]    = 1
     instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
                      11111 (immediate post inc)
     instr[15,13] = opcode
     instr[12]    = S, used for lane number
     instr[11,10] = size, also used for lane number
     instr[9,5]   = address
     instr[4,0]   = Vd  */

  unsigned full = INSTR (30, 30);
  unsigned vd = INSTR (4, 0);
  unsigned size = INSTR (11, 10);
  unsigned s = INSTR (12, 12);
  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
  int lane = 0;
  int i;

  NYI_assert (29, 24, 0x0D);
  NYI_assert (22, 22, 1);

  /* Compute the lane number first (using size), and then compute size.  */
  LDn_STn_SINGLE_LANE_AND_SIZE ();

  for (i = 0; i < nregs; i++)
    {
      /* Register numbers are 5 bits wide, so the list wraps modulo 32
	 instead of running past the last register.  */
      unsigned vt = (vd + i) & 0x1F;

      switch (size)
	{
	case 0:
	  {
	    uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
	    aarch64_set_vec_u8 (cpu, vt, lane, val);
	    break;
	  }

	case 1:
	  {
	    uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
	    aarch64_set_vec_u16 (cpu, vt, lane, val);
	    break;
	  }

	case 2:
	  {
	    uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
	    aarch64_set_vec_u32 (cpu, vt, lane, val);
	    break;
	  }

	case 3:
	  {
	    uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
	    aarch64_set_vec_u64 (cpu, vt, lane, val);
	    break;
	  }
	}
    }
}

/* Store single structure from one lane from N registers.

   ADDRESS is the address of the first element.  The same lane of N
   consecutive registers starting at Vd is written to consecutive
   memory locations.  The register list wraps from register 31 back
   to register 0.  */
static void
do_vec_STn_single (sim_cpu *cpu, uint64_t address)
{
  /* instr[31]    = 0
     instr[30]    = element selector 0=>half, 1=>all elements
     instr[29,24] = 00 1101
     instr[23]    = 0=>simple, 1=>post
     instr[22]    = 0
     instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
                      11111 (immediate post inc)
     instr[15,13] = opcode
     instr[12]    = S, used for lane number
     instr[11,10] = size, also used for lane number
     instr[9,5]   = address
     instr[4,0]   = Vd  */

  unsigned full = INSTR (30, 30);
  unsigned vd = INSTR (4, 0);
  unsigned size = INSTR (11, 10);
  unsigned s = INSTR (12, 12);
  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
  int lane = 0;
  int i;

  NYI_assert (29, 24, 0x0D);
  NYI_assert (22, 22, 0);

  /* Compute the lane number first (using size), and then compute size.  */
  LDn_STn_SINGLE_LANE_AND_SIZE ();

  for (i = 0; i < nregs; i++)
    {
      /* Register numbers are 5 bits wide, so the list wraps modulo 32
	 instead of running past the last register.  */
      unsigned vt = (vd + i) & 0x1F;

      switch (size)
	{
	case 0:
	  {
	    uint8_t val = aarch64_get_vec_u8 (cpu, vt, lane);
	    aarch64_set_mem_u8 (cpu, address + i, val);
	    break;
	  }

	case 1:
	  {
	    uint16_t val = aarch64_get_vec_u16 (cpu, vt, lane);
	    aarch64_set_mem_u16 (cpu, address + (i * 2), val);
	    break;
	  }

	case 2:
	  {
	    uint32_t val = aarch64_get_vec_u32 (cpu, vt, lane);
	    aarch64_set_mem_u32 (cpu, address + (i * 4), val);
	    break;
	  }

	case 3:
	  {
	    uint64_t val = aarch64_get_vec_u64 (cpu, vt, lane);
	    aarch64_set_mem_u64 (cpu, address + (i * 8), val);
	    break;
	  }
	}
    }
}

/* Load single structure into all lanes of N registers.

   ADDRESS is the address of the first element.  One element is read
   per register from consecutive memory locations and replicated into
   every lane (8 or 16 bytes worth, depending on instr[30]) of N
   consecutive registers starting at Vd.  The register list wraps
   from register 31 back to register 0.  */
static void
do_vec_LDnR (sim_cpu *cpu, uint64_t address)
{
  /* instr[31]    = 0
     instr[30]    = element selector 0=>half, 1=>all elements
     instr[29,24] = 00 1101
     instr[23]    = 0=>simple, 1=>post
     instr[22]    = 1
     instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
                      11111 (immediate post inc)
     instr[15,14] = 11
     instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
     instr[12]    = 0
     instr[11,10] = element size 00=> byte(b), 01=> half(h),
                                 10=> word(s), 11=> double(d)
     instr[9,5]   = address
     instr[4,0]   = Vd  */

  unsigned full = INSTR (30, 30);
  unsigned vd = INSTR (4, 0);
  unsigned size = INSTR (11, 10);
  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
  int i, n;

  NYI_assert (29, 24, 0x0D);
  NYI_assert (22, 22, 1);
  NYI_assert (15, 14, 3);
  NYI_assert (12, 12, 0);

  for (n = 0; n < nregs; n++)
    {
      /* Register numbers are 5 bits wide, so the list wraps modulo 32
	 instead of running past the last register.  */
      unsigned vt = (vd + n) & 0x1F;

      switch (size)
	{
	case 0:
	  {
	    uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
	    for (i = 0; i < (full ? 16 : 8); i++)
	      aarch64_set_vec_u8 (cpu, vt, i, val);
	    break;
	  }

	case 1:
	  {
	    uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
	    for (i = 0; i < (full ? 8 : 4); i++)
	      aarch64_set_vec_u16 (cpu, vt, i, val);
	    break;
	  }

	case 2:
	  {
	    uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
	    for (i = 0; i < (full ? 4 : 2); i++)
	      aarch64_set_vec_u32 (cpu, vt, i, val);
	    break;
	  }

	case 3:
	  {
	    uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
	    for (i = 0; i < (full ? 2 : 1); i++)
	      aarch64_set_vec_u64 (cpu, vt, i, val);
	    break;
	  }

	default:
	  HALT_UNALLOC;
	}
    }
}

/* Decode and execute an AdvSIMD structure load or store.

   The base address is read once, before any write-back.  For
   post-index forms the base register is then updated, and finally the
   transfer is dispatched to one of the LDn/STn helpers using the
   original base address.  */
static void
do_vec_load_store (sim_cpu *cpu)
{
  /* {LD|ST}<N>   {Vd..Vd+N}, vaddr

     instr[31]    = 0
     instr[30]    = element selector 0=>half, 1=>all elements
     instr[29,25] = 00110
     instr[24]    = 0=>multiple struct, 1=>single struct
     instr[23]    = 0=>simple, 1=>post
     instr[22]    = 0=>store, 1=>load
     instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
     instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
                    11111 (immediate post inc)
     instr[15,12] = elements and destinations.  eg for load:
                     0000=>LD4 => load multiple 4-element to
		     four consecutive registers
                     0100=>LD3 => load multiple 3-element to
		     three consecutive registers
                     1000=>LD2 => load multiple 2-element to
		     two consecutive registers
                     0010=>LD1 => load multiple 1-element to
		     four consecutive registers
                     0110=>LD1 => load multiple 1-element to
		     three consecutive registers
                     1010=>LD1 => load multiple 1-element to
		     two consecutive registers
                     0111=>LD1 => load multiple 1-element to
		     one register
                     1100=>LD1R,LD2R
                     1110=>LD3R,LD4R
     instr[11,10] = element size 00=> byte(b), 01=> half(h),
                                 10=> word(s), 11=> double(d)
     instr[9,5]   = Vn, can be SP
     instr[4,0]   = Vd  */

  int single;
  int post;
  int load;
  unsigned vn;
  uint64_t address;
  int type;

  if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
    HALT_NYI;

  single = INSTR (24, 24);
  post = INSTR (23, 23);
  load = INSTR (22, 22);
  type = INSTR (15, 12);
  vn = INSTR (9, 5);
  /* Captured before write-back so the transfer below uses the
     original base.  */
  address = aarch64_get_reg_u64 (cpu, vn, SP_OK);

  /* Bit 21 is only meaningful for single-structure forms.  */
  if (! single && INSTR (21, 21) != 0)
    HALT_UNALLOC;

  if (post)
    {
      unsigned vm = INSTR (20, 16);

      /* Register 31 in the increment field selects the immediate
	 form, whose increment is derived from the encoding.  */
      if (vm == R31)
	{
	  unsigned sizeof_operation;

	  if (single)
	    {
	      /* Single lane: one element for each of nregs registers.  */
	      if ((type >= 0) && (type <= 11))
		{
		  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
		  switch (INSTR (15, 14))
		    {
		    case 0:
		      sizeof_operation = nregs * 1;
		      break;
		    case 1:
		      sizeof_operation = nregs * 2;
		      break;
		    case 2:
		      /* Bit 10 distinguishes word from doubleword.  */
		      if (INSTR (10, 10) == 0)
			sizeof_operation = nregs * 4;
		      else
			sizeof_operation = nregs * 8;
		      break;
		    default:
		      HALT_UNALLOC;
		    }
		}
	      /* Replicating loads: element count scaled by the
		 element size.  */
	      else if (type == 0xC)
		{
		  sizeof_operation = INSTR (21, 21) ? 2 : 1;
		  sizeof_operation <<= INSTR (11, 10);
		}
	      else if (type == 0xE)
		{
		  sizeof_operation = INSTR (21, 21) ? 4 : 3;
		  sizeof_operation <<= INSTR (11, 10);
		}
	      else
		HALT_UNALLOC;
	    }
	  else
	    {
	      /* Multiple structures: 8 bytes for each register
		 transferred, doubled below for full registers.  */
	      switch (type)
		{
		case 0: sizeof_operation = 32; break;
		case 4: sizeof_operation = 24; break;
		case 8: sizeof_operation = 16; break;

		case 7:
		  /* One register, immediate offset variant.  */
		  sizeof_operation = 8;
		  break;

		case 10:
		  /* Two registers, immediate offset variant.  */
		  sizeof_operation = 16;
		  break;

		case 6:
		  /* Three registers, immediate offset variant.  */
		  sizeof_operation = 24;
		  break;

		case 2:
		  /* Four registers, immediate offset variant.  */
		  sizeof_operation = 32;
		  break;

		default:
		  HALT_UNALLOC;
		}

	      if (INSTR (30, 30))
		sizeof_operation *= 2;
	    }

	  aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
	}
      else
	/* Register form: increment by the value of register vm.  */
	aarch64_set_reg_u64 (cpu, vn, SP_OK,
			     address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
    }
  else
    {
      NYI_assert (20, 16, 0);
    }

  if (single)
    {
      if (load)
	{
	  if ((type >= 0) && (type <= 11))
	    do_vec_LDn_single (cpu, address);
	  else if ((type == 0xC) || (type == 0xE))
	    do_vec_LDnR (cpu, address);
	  else
	    HALT_UNALLOC;
	  return;
	}

      /* Stores.  */
      if ((type >= 0) && (type <= 11))
	{
	  do_vec_STn_single (cpu, address);
	  return;
	}

      HALT_UNALLOC;
    }

  if (load)
    {
      switch (type)
	{
	case 0:  LD4 (cpu, address); return;
	case 4:  LD3 (cpu, address); return;
	case 8:  LD2 (cpu, address); return;
	case 2:  LD1_4 (cpu, address); return;
	case 6:  LD1_3 (cpu, address); return;
	case 10: LD1_2 (cpu, address); return;
	case 7:  LD1_1 (cpu, address); return;

	default:
	  HALT_UNALLOC;
	}
    }

  /* Stores.  */
  switch (type)
    {
    case 0:  ST4 (cpu, address); return;
    case 4:  ST3 (cpu, address); return;
    case 8:  ST2 (cpu, address); return;
    case 2:  ST1_4 (cpu, address); return;
    case 6:  ST1_3 (cpu, address); return;
    case 10: ST1_2 (cpu, address); return;
    case 7:  ST1_1 (cpu, address); return;
    default:
      HALT_UNALLOC;
    }
}

/* Secondary dispatch for the load/store instruction groups.

   The caller has already established that the instruction belongs to
   one of the load/store groups; the value returned by dispatchLS
   selects the handler for the sub-group.  */
static void
dexLdSt (sim_cpu *cpu)
{
  /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
     assert  group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
             group == GROUP_LDST_1100 || group == GROUP_LDST_1110
     bits [29,28:26] of a LS are the secondary dispatch vector.  */
  const uint32_t subgroup = dispatchLS (aarch64_get_instr (cpu));

  if (subgroup == LS_EXCL_000)
    dexLoadExclusive (cpu);
  else if (subgroup == LS_LIT_010 || subgroup == LS_LIT_011)
    dexLoadLiteral (cpu);
  else if (subgroup == LS_OTHER_110 || subgroup == LS_OTHER_111)
    dexLoadOther (cpu);
  else if (subgroup == LS_ADVSIMD_001)
    do_vec_load_store (cpu);
  else if (subgroup == LS_PAIR_100)
    dex_load_store_pair_gr (cpu);
  else if (subgroup == LS_PAIR_101)
    dex_load_store_pair_fp (cpu);
  else
    /* Should never reach here.  */
    HALT_NYI;
}

/* Specific decode and execute for group Data Processing Register.  */

/* Decode and execute a logical (shifted register) instruction.

     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30,29] = op
     instr[28:24] = 01010
     instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
     instr[21]    = N
     instr[20,16] = Rm
     instr[15,10] = count : must be 0xxxxx for 32 bit
     instr[9,5]   = Rn
     instr[4,0]   = Rd  */
static void
dexLogicalShiftedRegister (sim_cpu *cpu)
{
  const uint32_t is64 = INSTR (31, 31);
  const Shift shift = INSTR (23, 22);
  const uint32_t amount = INSTR (15, 10);
  /* op:N picks the operation within one register width.  */
  const uint32_t op_n = (INSTR (30, 29) << 1) | INSTR (21, 21);

  /* 32 bit operations must have count[5] = 0.
     or else we have an UNALLOC.  */
  if (! is64 && ((amount >> 5) & 1))
    HALT_UNALLOC;

  if (! is64)
    {
      switch (op_n)
        {
        case 0: and32_shift  (cpu, shift, amount); return;
        case 1: bic32_shift  (cpu, shift, amount); return;
        case 2: orr32_shift  (cpu, shift, amount); return;
        case 3: orn32_shift  (cpu, shift, amount); return;
        case 4: eor32_shift  (cpu, shift, amount); return;
        case 5: eon32_shift  (cpu, shift, amount); return;
        case 6: ands32_shift (cpu, shift, amount); return;
        case 7: bics32_shift (cpu, shift, amount); return;
        }
      return;
    }

  switch (op_n)
    {
    case 0: and64_shift  (cpu, shift, amount); return;
    case 1: bic64_shift  (cpu, shift, amount); return;
    case 2: orr64_shift  (cpu, shift, amount); return;
    case 3: orn64_shift  (cpu, shift, amount); return;
    case 4: eor64_shift  (cpu, shift, amount); return;
    case 5: eon64_shift  (cpu, shift, amount); return;
    case 6: ands64_shift (cpu, shift, amount); return;
    case 7: bics64_shift (cpu, shift, amount); return;
    }
}

/* 32 bit conditional select: Rd receives Rn when condition CC holds
   and Rm otherwise, zero-extended to 64 bits.  */
static void
csel32 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u32 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u32 (cpu, if_false, NO_SP));
}

/* 64 bit conditional select: Rd receives Rn when condition CC holds
   and Rm otherwise.  */
static void
csel64 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u64 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u64 (cpu, if_false, NO_SP));
}

/* 32 bit conditional increment: Rd receives Rn when condition CC
   holds and Rm + 1 otherwise.  */
static void
csinc32 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u32 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u32 (cpu, if_false, NO_SP) + 1);
}

/* 64 bit conditional increment: Rd receives Rn when condition CC
   holds and Rm + 1 otherwise.  */
static void
csinc64 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u64 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u64 (cpu, if_false, NO_SP) + 1);
}

/* 32 bit conditional invert: Rd receives Rn when condition CC holds
   and the bitwise complement of Rm otherwise.  */
static void
csinv32 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u32 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         ~ aarch64_get_reg_u32 (cpu, if_false, NO_SP));
}

/* 64 bit conditional invert: Rd receives Rn when condition CC holds
   and the bitwise complement of Rm otherwise.  */
static void
csinv64 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u64 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         ~ aarch64_get_reg_u64 (cpu, if_false, NO_SP));
}

/* 32 bit conditional negate: Rd receives Rn when condition CC holds
   and the negation of Rm otherwise.  */
static void
csneg32 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u32 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         - aarch64_get_reg_u32 (cpu, if_false, NO_SP));
}

/* 64 bit conditional negate: Rd receives Rn when condition CC holds
   and the negation of Rm otherwise.  */
static void
csneg64 (sim_cpu *cpu, CondCode cc)
{
  const unsigned if_false = INSTR (20, 16);
  const unsigned if_true = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);

  if (testConditionCode (cpu, cc))
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         aarch64_get_reg_u64 (cpu, if_true, NO_SP));
  else
    aarch64_set_reg_u64 (cpu, dest, NO_SP,
                         - aarch64_get_reg_u64 (cpu, if_false, NO_SP));
}

/* Decode and execute a conditional select instruction.

     instr[31]       = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[30:11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
                            100 ==> CSINV, 101 ==> CSNEG,
                            _1_ ==> UNALLOC
     instr[29]       = S : 0 ==> ok, 1 ==> UNALLOC
     instr[28,21]    = 11010100
     instr[15,12]    = cond  */
static void
dexCondSelect (sim_cpu *cpu)
{
  const CondCode cond = INSTR (15, 12);
  const uint32_t low_op = INSTR (11, 10);
  /* instr[30] together with instr[11,10] selects the operation.  */
  const uint32_t op = (INSTR (30, 30) << 1) | low_op;

  if (INSTR (29, 29) == 1)
    HALT_UNALLOC;

  if (low_op & 0x2)
    HALT_UNALLOC;

  if (INSTR (31, 31) == 0)
    {
      switch (op)
        {
        case 0: csel32  (cpu, cond); return;
        case 1: csinc32 (cpu, cond); return;
        case 2: csinv32 (cpu, cond); return;
        case 3: csneg32 (cpu, cond); return;
        }
      return;
    }

  switch (op)
    {
    case 0: csel64  (cpu, cond); return;
    case 1: csinc64 (cpu, cond); return;
    case 2: csinv64 (cpu, cond); return;
    case 3: csneg64 (cpu, cond); return;
    }
}

/* Some helpers for counting leading 1 or 0 bits.  */

/* Counts the number of leading bits which are the same
   in a 32 bit value in the range 1 to 32.

   The run is measured as a count of leading zeros: a value whose top
   bit is set is complemented first, which turns its leading ones into
   leading zeros without changing the length of the run.  */
static uint32_t
leading32 (uint32_t value)
{
  uint32_t bits = (value >> 31) ? ~value : value;
  uint32_t count = 0;
  uint32_t width;

  /* All 32 bits are identical.  */
  if (bits == 0)
    return 32;

  /* Binary search: whenever the top WIDTH bits are all zero, account
     for them and shift them out.  */
  for (width = 16; width != 0; width >>= 1)
    if ((bits >> (32 - width)) == 0)
      {
        count += width;
        bits <<= width;
      }

  return count;
}

/* Counts the number of leading bits which are the same
   in a 64 bit value in the range 1 to 64.

   The run is measured as a count of leading zeros: a value whose top
   bit is set is complemented first, which turns its leading ones into
   leading zeros without changing the length of the run.  */
static uint64_t
leading64 (uint64_t value)
{
  uint64_t bits = (value >> 63) ? ~value : value;
  uint64_t count = 0;
  unsigned width;

  /* All 64 bits are identical.  */
  if (bits == 0)
    return 64;

  /* Binary search: whenever the top WIDTH bits are all zero, account
     for them and shift them out.  */
  for (width = 32; width != 0; width >>= 1)
    if ((bits >> (64 - width)) == 0)
      {
        count += width;
        bits <<= width;
      }

  return count;
}

/* Bit operations.  */
/* N.B register args may not be SP.  */

/* 32 bit count leading sign bits: Rd receives the number of bits
   below the top bit of Rn that are equal to it.  */
static void
cls32 (sim_cpu *cpu)
{
  const unsigned src = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);
  const uint32_t operand = aarch64_get_reg_u32 (cpu, src, NO_SP);

  /* N.B. the result needs to exclude the leading bit.  */
  aarch64_set_reg_u64 (cpu, dest, NO_SP, leading32 (operand) - 1);
}

/* 64 bit count leading sign bits: Rd receives the number of bits
   below the top bit of Rn that are equal to it.  */
static void
cls64 (sim_cpu *cpu)
{
  const unsigned src = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);
  const uint64_t operand = aarch64_get_reg_u64 (cpu, src, NO_SP);

  /* N.B. the result needs to exclude the leading bit.  */
  aarch64_set_reg_u64 (cpu, dest, NO_SP, leading64 (operand) - 1);
}

/* 32 bit count leading zero bits: Rd receives the number of zero
   bits above the most significant set bit of Rn.  */
static void
clz32 (sim_cpu *cpu)
{
  const unsigned src = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);
  const uint32_t operand = aarch64_get_reg_u32 (cpu, src, NO_SP);

  /* leading32 measures a run of identical top bits; that run only
     consists of zeros when the sign (top) bit is clear.  */
  if (! pick32 (operand, 31, 31))
    {
      aarch64_set_reg_u64 (cpu, dest, NO_SP, leading32 (operand));
      return;
    }

  aarch64_set_reg_u64 (cpu, dest, NO_SP, 0L);
}

/* 64 bit count leading zero bits: Rd receives the number of zero
   bits above the most significant set bit of Rn.  */
static void
clz64 (sim_cpu *cpu)
{
  const unsigned src = INSTR (9, 5);
  const unsigned dest = INSTR (4, 0);
  const uint64_t operand = aarch64_get_reg_u64 (cpu, src, NO_SP);

  /* leading64 measures a run of identical top bits; that run only
     consists of zeros when the sign (top) bit is clear.  */
  if (! pick64 (operand, 63, 63))
    {
      aarch64_set_reg_u64 (cpu, dest, NO_SP, leading64 (operand));
      return;
    }

  aarch64_set_reg_u64 (cpu, dest, NO_SP, 0L);
}

/* RBIT (32 bit): write Wn with its bit order reversed to Rd.  */
static void
rbit32 (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t input = aarch64_get_reg_u32 (cpu, src, NO_SP);
  uint32_t mirrored = 0;
  int bit;

  /* Bit BIT of the input lands at bit 31 - BIT of the output.  */
  for (bit = 0; bit < 32; bit++)
    if (input & ((uint32_t) 1 << bit))
      mirrored |= (uint32_t) 1 << (31 - bit);

  aarch64_set_reg_u64 (cpu, dst, NO_SP, mirrored);
}

/* RBIT (64 bit): write Xn with its bit order reversed to Rd.  */
static void
rbit64 (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t input = aarch64_get_reg_u64 (cpu, src, NO_SP);
  uint64_t mirrored = 0;
  int bit;

  /* Bit BIT of the input lands at bit 63 - BIT of the output.  */
  for (bit = 0; bit < 64; bit++)
    if (input & ((uint64_t) 1 << bit))
      mirrored |= (uint64_t) 1 << (63 - bit);

  aarch64_set_reg_u64 (cpu, dst, NO_SP, mirrored);
}

/* REV (32 bit): write Wn with its four bytes in reverse order to Rd.  */
static void
rev32 (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t input = aarch64_get_reg_u32 (cpu, src, NO_SP);
  uint32_t swapped;

  /* Move each byte directly to its mirrored position.  */
  swapped = ((input & 0x000000ffU) << 24)
            | ((input & 0x0000ff00U) << 8)
            | ((input >> 8) & 0x0000ff00U)
            | (input >> 24);

  aarch64_set_reg_u64 (cpu, dst, NO_SP, swapped);
}

/* REV (64 bit): write Xn with its eight bytes in reverse order to Rd.  */
static void
rev64 (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t input = aarch64_get_reg_u64 (cpu, src, NO_SP);
  uint64_t swapped = 0;
  int byte;

  /* Byte BYTE of the input lands at byte 7 - BYTE of the output.  */
  for (byte = 0; byte < 8; byte++)
    {
      uint64_t lane = (input >> (8 * byte)) & 0xffULL;

      swapped |= lane << (8 * (7 - byte));
    }

  aarch64_set_reg_u64 (cpu, dst, NO_SP, swapped);
}

/* REV16 (32 bit): swap the two bytes inside each half word of Wn and
   write the result to Rd.  The half words themselves stay in place.  */
static void
revh32 (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint32_t input = aarch64_get_reg_u32 (cpu, src, NO_SP);
  /* Low byte of every half word, moved up by one byte.  */
  uint32_t raised = (input & 0x00ff00ffU) << 8;
  /* High byte of every half word, moved down by one byte.  */
  uint32_t lowered = (input >> 8) & 0x00ff00ffU;

  aarch64_set_reg_u64 (cpu, dst, NO_SP, raised | lowered);
}

/* REV16 (64 bit): swap the two bytes inside each half word of Xn and
   write the result to Rd.  The half words themselves stay in place.  */
static void
revh64 (sim_cpu *cpu)
{
  unsigned src = INSTR (9, 5);
  unsigned dst = INSTR (4, 0);
  uint64_t input = aarch64_get_reg_u64 (cpu, src, NO_SP);
  /* Low byte of every half word, moved up by one byte.  */
  uint64_t raised = (input & 0x00ff00ff00ff00ffULL) << 8;
  /* High byte of every half word, moved down by one byte.  */
  uint64_t lowered = (input >> 8) & 0x00ff00ff00ff00ffULL;

  aarch64_set_reg_u64 (cpu, dst, NO_SP, raised | lowered);
}

/* REV32 (64 bit): reverse the byte order inside each of the two 32 bit
   words of Xn and write the full 64 bit result to Rd.  */
static void
rev32_64 (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t result;

  /* Each mask selects the same byte lane in both words at once.  */
  result = ((value & 0x000000ff000000ffULL) << 24)
           | ((value & 0x0000ff000000ff00ULL) << 8)
           | ((value >> 8) & 0x0000ff000000ff00ULL)
           | ((value >> 24) & 0x000000ff000000ffULL);

  aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
}

/* Decode and execute the data processing (1 source) group.  */
static void
dexDataProc1Source (sim_cpu *cpu)
{
  /* instr[30]    = 1
     instr[28,21] = 11010110
     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
     instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
     instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
                             000010 ==> REV (32 bit) / REV32 (64 bit),
                             000011 ==> UNALLOC (32 bit) / REV (64 bit),
                             000100 ==> CLZ, 000101 ==> CLS
                             ow ==> UNALLOC
     instr[9,5]   = rn : may not be SP
     instr[4,0]   = rd : may not be SP.  */

  uint32_t S = INSTR (29, 29);
  uint32_t opcode2 = INSTR (20, 16);
  uint32_t opcode = INSTR (15, 10);
  /* Size bit on top of the low three opcode bits.  */
  uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);

  if (S == 1)
    HALT_UNALLOC;

  if (opcode2 != 0)
    HALT_UNALLOC;

  /* Only the low three opcode bits are ever allocated.  */
  if (opcode & 0x38)
    HALT_UNALLOC;

  switch (dispatch)
    {
    case 0: rbit32 (cpu); return;
    case 1: revh32 (cpu); return;
    case 2: rev32 (cpu); return;
    case 4: clz32 (cpu); return;
    case 5: cls32 (cpu); return;
    case 8: rbit64 (cpu); return;
    case 9: revh64 (cpu); return;
    /* REV32 Xd, Xn must byte-reverse both words of the 64 bit source;
       the 32 bit rev32 would discard the upper word.  */
    case 10:rev32_64 (cpu); return;
    case 11:rev64 (cpu); return;
    case 12:clz64 (cpu); return;
    case 13:cls64 (cpu); return;
    default: HALT_UNALLOC;
    }
}

/* Variable shift.
   Shifts by count supplied in register.
   N.B register args may not be SP.
   These all use the shifted auxiliary function for
   simplicity and clarity.  Writing the actual shift
   inline would avoid a branch and so be faster but
   would also necessitate getting signs right.  */

/* ASRV (32 bit): Rd = Wn arithmetically shifted right by the low five
   bits of Wm.  */
static void
asrv32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint32_t operand = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t amount = aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted32 (operand, ASR, amount));
}

/* ASRV (64 bit): Rd = Xn arithmetically shifted right by the low six
   bits of Xm.  */
static void
asrv64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t operand = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t amount = aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted64 (operand, ASR, amount));
}

/* LSLV (32 bit): Rd = Wn logically shifted left by the low five bits
   of Wm.  */
static void
lslv32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint32_t operand = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t amount = aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted32 (operand, LSL, amount));
}

/* LSLV (64 bit): Rd = Xn logically shifted left by the low six bits
   of Xm.  */
static void
lslv64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t operand = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t amount = aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted64 (operand, LSL, amount));
}

/* LSRV (32 bit): Rd = Wn logically shifted right by the low five bits
   of Wm.  */
static void
lsrv32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint32_t operand = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t amount = aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted32 (operand, LSR, amount));
}

/* LSRV (64 bit): Rd = Xn logically shifted right by the low six bits
   of Xm.  */
static void
lsrv64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t operand = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t amount = aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted64 (operand, LSR, amount));
}

/* RORV (32 bit): Rd = Wn rotated right by the low five bits of Wm.  */
static void
rorv32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint32_t operand = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint32_t amount = aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted32 (operand, ROR, amount));
}

/* RORV (64 bit): Rd = Xn rotated right by the low six bits of Xm.  */
static void
rorv64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t operand = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t amount = aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, shifted64 (operand, ROR, amount));
}


/* divide.  */

/* 32 bit signed divide: Rd = Wn / Wm, or 0 when Wm is zero.

   The 32 bit quotient is zero extended into the 64 bit destination,
   matching udiv32.  Writing it through the signed 64 bit setter would
   sign extend a negative quotient into the upper 32 bits.  */
static void
cpuiv32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  /* N.B. the pseudo-code does the divide using 64 bit data.  Doing so
     also keeps INT32_MIN / -1 from overflowing.  C division truncates
     towards zero, as the instruction requires.  */
  int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
  int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
  int64_t quotient = divisor ? dividend / divisor : 0;

  /* Conversion to uint32_t keeps the low 32 bits of the quotient.  */
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) quotient);
}

/* 64 bit signed divide: Rd = Xn / Xm, or 0 when Xm is zero.  */
static void
cpuiv64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  int64_t dividend = aarch64_get_reg_s64 (cpu, rn, NO_SP);
  int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
  int64_t quotient;

  if (divisor == 0)
    quotient = 0;
  else if (divisor == -1)
    /* INT64_MIN / -1 overflows, which is undefined behaviour in C and
       traps on some hosts.  Negate in unsigned arithmetic instead so
       that the overflowing case wraps back to INT64_MIN.  */
    quotient = (int64_t) (0 - (uint64_t) dividend);
  else
    /* C division truncates towards zero, as the instruction requires.  */
    quotient = dividend / divisor;

  aarch64_set_reg_s64 (cpu, rd, NO_SP, quotient);
}

/* UDIV (32 bit): Rd = Wn / Wm, or 0 when Wm is zero.  */
static void
udiv32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  /* N.B. the pseudo-code does the divide using 64 bit data.  */
  uint64_t numerator = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  uint64_t denominator = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  uint32_t quotient = 0;

  if (denominator != 0)
    quotient = (uint32_t) (numerator / denominator);

  aarch64_set_reg_u64 (cpu, rd, NO_SP, quotient);
}

/* UDIV (64 bit): Rd = Xn / Xm, or 0 when Xm is zero.  */
static void
udiv64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t denominator = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  uint64_t quotient = 0;

  /* The dividend is only fetched when a division actually happens.  */
  if (denominator != 0)
    quotient = aarch64_get_reg_u64 (cpu, rn, NO_SP) / denominator;

  aarch64_set_reg_u64 (cpu, rd, NO_SP, quotient);
}

/* Decode and execute the data processing (2 source) group: the
   divides and the variable shifts.  */
static void
dexDataProc2Source (sim_cpu *cpu)
{
  /* assert instr[30] == 0
     instr[28,21] == 11010110
     instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
     instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
     instr[15,10] = opcode : 000010 ==> unsigned divide,
                             000011 ==> signed divide,
                             001000 ==> LSLV, 001001 ==> LSRV
                             001010 ==> ASRV, 001011 ==> RORV
                             ow ==> UNALLOC.  */

  uint32_t is64;
  uint32_t selector;
  uint32_t S = INSTR (29, 29);
  uint32_t opcode = INSTR (15, 10);

  if (S == 1)
    HALT_UNALLOC;

  /* Opcode bits 5, 4 and 2 are clear in every allocated encoding.  */
  if (opcode & 0x34)
    HALT_UNALLOC;

  is64 = INSTR (31, 31);
  /* Squeeze the remaining opcode bits 3, 1 and 0 into three bits.  */
  selector = (uimm (opcode, 3, 3) << 2) | uimm (opcode, 1, 0);

  switch (selector)
    {
    case 2:
      if (is64) udiv64 (cpu); else udiv32 (cpu);
      return;
    case 3:
      if (is64) cpuiv64 (cpu); else cpuiv32 (cpu);
      return;
    case 4:
      if (is64) lslv64 (cpu); else lslv32 (cpu);
      return;
    case 5:
      if (is64) lsrv64 (cpu); else lsrv32 (cpu);
      return;
    case 6:
      if (is64) asrv64 (cpu); else asrv32 (cpu);
      return;
    case 7:
      if (is64) rorv64 (cpu); else rorv32 (cpu);
      return;
    default: HALT_UNALLOC;
    }
}


/* Multiply.  */

/* MADD (32 bit): Rd = Wa + Wn * Wm, computed modulo 2^32.  */
static void
madd32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint32_t addend;
  uint32_t lhs;
  uint32_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_reg_u32 (cpu, ra, NO_SP);
  lhs = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  rhs = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, addend + lhs * rhs);
}

/* MADD (64 bit): Rd = Xa + Xn * Xm, computed modulo 2^64.  */
static void
madd64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t addend;
  uint64_t lhs;
  uint64_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  addend = aarch64_get_reg_u64 (cpu, ra, NO_SP);
  lhs = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  rhs = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, addend + lhs * rhs);
}

/* MSUB (32 bit): Rd = Wa - Wn * Wm, computed modulo 2^32.  */
static void
msub32 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint32_t minuend;
  uint32_t lhs;
  uint32_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  minuend = aarch64_get_reg_u32 (cpu, ra, NO_SP);
  lhs = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  rhs = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, minuend - lhs * rhs);
}

/* MSUB (64 bit): Rd = Xa - Xn * Xm, computed modulo 2^64.  */
static void
msub64 (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t minuend;
  uint64_t lhs;
  uint64_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  minuend = aarch64_get_reg_u64 (cpu, ra, NO_SP);
  lhs = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  rhs = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, minuend - lhs * rhs);
}

/* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit.
   Rd = Xa + (sign extended Wn * sign extended Wm).  */
static void
smaddl (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  /* N.B. we need to multiply the signed 32 bit values in rn, rm to
     obtain a 64 bit product.  The product of two 32 bit values always
     fits in 64 bits.  */
  int64_t product = ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
                    * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP));
  /* The accumulate can exceed the signed 64 bit range, which would be
     undefined behaviour in signed arithmetic.  Do it unsigned so that
     it wraps modulo 2^64.  */
  uint64_t sum = (uint64_t) aarch64_get_reg_s64 (cpu, ra, NO_SP)
                 + (uint64_t) product;

  aarch64_set_reg_s64 (cpu, rd, NO_SP, (int64_t) sum);
}

/* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.
   Rd = Xa - (sign extended Wn * sign extended Wm).  */
static void
smsubl (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  /* N.B. we need to multiply the signed 32 bit values in rn, rm to
     obtain a 64 bit product.  The product of two 32 bit values always
     fits in 64 bits.  */
  int64_t product = ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
                    * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP));
  /* The subtraction can exceed the signed 64 bit range, which would be
     undefined behaviour in signed arithmetic.  Do it unsigned so that
     it wraps modulo 2^64.  */
  uint64_t difference = (uint64_t) aarch64_get_reg_s64 (cpu, ra, NO_SP)
                        - (uint64_t) product;

  aarch64_set_reg_s64 (cpu, rd, NO_SP, (int64_t) difference);
}

/* Integer Multiply/Divide.  */

/* First some macros and a helper function.  */
/* Macros to test or access elements of 64 bit words.  */

/* Mask used to access lo 32 bits of 64 bit unsigned int.  */
#define LOW_WORD_MASK ((1ULL << 32) - 1)
/* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
/* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define highWordToU64(_value_u64) ((_value_u64) >> 32)

/* Offset of sign bit in 64 bit signed integer.  */
#define SIGN_SHIFT_U64 63
/* The sign bit itself -- also identifies the minimum negative int value.
   The constant is unsigned long long because a plain unsigned long may
   be only 32 bits wide, which would make the shift undefined.  */
#define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
/* Return true if a 64 bit signed int presented as an unsigned int is the
   most negative value.  */
#define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
/* Return true (non-zero) if a 64 bit signed int presented as an unsigned
   int has its sign bit set.  */
#define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
/* Return 1 if a 64 bit signed int presented as an unsigned int has its
   sign bit clear, or -1 if the sign bit is set.  The result is signed.  */
#define signOfU64(_value_u64) \
  ((int64_t) 1 - 2 * (int64_t) ((_value_u64) >> SIGN_SHIFT_U64))
/* Clear the sign bit of a 64 bit signed int presented as an unsigned int.
   The argument must be a modifiable lvalue.  */
#define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)

/* Multiply two unsigned 64 bit values and return the upper 64 bits of
   the 128 bit product.  The product is assembled from four 32 x 32 bit
   partial products.  */

static uint64_t
mul64hi (uint64_t value1, uint64_t value2)
{
  uint64_t a_lo = lowWordToU64 (value1);
  uint64_t a_hi = highWordToU64 (value1);
  uint64_t b_lo = lowWordToU64 (value2);
  uint64_t b_hi = highWordToU64 (value2);

  /* The four partial products.  */
  uint64_t lo_lo = a_lo * b_lo;
  uint64_t lo_hi = a_lo * b_hi;
  uint64_t hi_lo = a_hi * b_lo;
  uint64_t hi_hi = a_hi * b_hi;

  /* Accumulate the middle column: the top half of the lowest partial
     product plus both cross products.  Each wrap-around of the 64 bit
     accumulator is one carry into the column above.  */
  uint64_t middle = (lo_lo >> 32) + lo_hi;
  uint64_t carries = (middle < lo_hi) ? 1 : 0;
  uint64_t middle_total = middle + hi_lo;

  if (middle_total < hi_lo)
    carries += 1;

  /* The middle column starts at bit 32 of the product, so its upper
     half and the carries (scaled by 2^32) belong in the high result.  */
  return (middle_total >> 32) + hi_hi + (carries << 32);
}

/* Signed multiply high, source, source2 :
   64 bit, dest <-- high 64-bit of result.  */
static void
smulh (sim_cpu *cpu)
{
  uint64_t uresult;
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  GReg     ra = INSTR (14, 10);
  uint64_t uvalue1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  uint64_t uvalue2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  int  negate = 0;

  if (ra != R31)
    HALT_UNALLOC;

  /* Convert both operands to their magnitudes, use the unsigned
     mul64hi routine and fix the sign up afterwards.  The negation is
     done on the unsigned representation: negating the most negative
     value as a signed integer would overflow, which is undefined
     behaviour, whereas unsigned negation yields the correct magnitude
     2^63.  */
  if (uvalue1 >> 63)
    {
      negate = !negate;
      uvalue1 = 0 - uvalue1;
    }

  if (uvalue2 >> 63)
    {
      negate = !negate;
      uvalue2 = 0 - uvalue2;
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  uresult = mul64hi (uvalue1, uvalue2);

  if (negate)
    {
      /* Multiply 128-bit result by -1, which means highpart gets inverted,
         and has carry in added only if low part is 0.  */
      uresult = ~uresult;
      if ((uvalue1 * uvalue2) == 0)
        uresult += 1;
    }

  aarch64_set_reg_s64 (cpu, rd, NO_SP, (int64_t) uresult);
}

/* UMADDL: Rd = Xa + (zero extended Wn * zero extended Wm).
   The sources are 32 bit, the addend and result are 64 bit.  */
static void
umaddl (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t addend;
  uint64_t lhs;
  uint64_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Widen the unsigned 32 bit sources before multiplying so that the
     full 64 bit product is kept.  */
  addend = aarch64_get_reg_u64 (cpu, ra, NO_SP);
  lhs = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  rhs = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, addend + lhs * rhs);
}

/* UMSUBL: Rd = Xa - (zero extended Wn * zero extended Wm).
   The sources are 32 bit, the minuend and result are 64 bit.  */
static void
umsubl (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned ra = INSTR (14, 10);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  uint64_t minuend;
  uint64_t lhs;
  uint64_t rhs;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Widen the unsigned 32 bit sources before multiplying so that the
     full 64 bit product is kept.  */
  minuend = aarch64_get_reg_u64 (cpu, ra, NO_SP);
  lhs = aarch64_get_reg_u32 (cpu, rn, NO_SP);
  rhs = aarch64_get_reg_u32 (cpu, rm, NO_SP);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, minuend - lhs * rhs);
}

/* UMULH: Rd = upper 64 bits of the unsigned 128 bit product Xn * Xm.
   The Ra field must name register 31, otherwise the encoding is
   unallocated.  */
static void
umulh (sim_cpu *cpu)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);
  GReg     ra = INSTR (14, 10);
  uint64_t lhs;
  uint64_t rhs;

  if (ra != R31)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  lhs = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  rhs = aarch64_get_reg_u64 (cpu, rm, NO_SP);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, mul64hi (lhs, rhs));
}

/* Decode and execute the data processing (3 source) group: the
   multiply-accumulate family and the multiply-high instructions.  */
static void
dexDataProc3Source (sim_cpu *cpu)
{
  /* assert instr[28,24] == 11011.  */
  /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
     instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
     instr[23,21] = op31 : 111 ==> UNALLOC, o2 ==> ok
     instr[15] = o0 : 0/1 ==> ok
     instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB,     (32/64 bit)
                              0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
                              0100 ==> SMULH,                   (64 bit only)
                              1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
                              1100 ==> UMULH                    (64 bit only)
                              ow ==> UNALLOC.  */

  uint32_t is64 = INSTR (31, 31);
  uint32_t op31 = INSTR (23, 21);
  uint32_t o0 = INSTR (15, 15);

  if (INSTR (30, 29) != 0)
    HALT_UNALLOC;

  /* The 32 bit forms only provide MADD and MSUB.  */
  if (! is64)
    {
      if (op31 != 0)
        HALT_UNALLOC;

      if (o0)
        msub32 (cpu);
      else
        madd32 (cpu);
      return;
    }

  /* 64 bit forms: select on op31:o0.  */
  switch ((op31 << 1) | o0)
    {
    case 0:  madd64 (cpu); return;
    case 1:  msub64 (cpu); return;
    case 2:  smaddl (cpu); return;
    case 3:  smsubl (cpu); return;
    case 4:  smulh (cpu); return;
    case 10: umaddl (cpu); return;
    case 11: umsubl (cpu); return;
    case 12: umulh (cpu); return;
    default: HALT_UNALLOC;
    }
}

/* Secondary decode for the data processing (register) groups.  */
static void
dexDPReg (sim_cpu *cpu)
{
  /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
     assert  group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
     bits [28:24:21] of a DPReg are the secondary dispatch vector.  */
  uint32_t instr = aarch64_get_instr (cpu);

  switch (dispatchDPReg (instr))
    {
    case DPREG_LOG_000:
    case DPREG_LOG_001:
      dexLogicalShiftedRegister (cpu);
      return;

    case DPREG_ADDSHF_010:
      dexAddSubtractShiftedRegister (cpu);
      return;

    case DPREG_ADDEXT_011:
      dexAddSubtractExtendedRegister (cpu);
      return;

    case DPREG_ADDCOND_100:
      /* This set bundles a variety of different operations, which are
         told apart by instr[28,21].  */
      switch (instr & 0x1FE00000U)
        {
        case 0x1A000000U:
          /* Add/sub with carry.  */
          dexAddSubtractWithCarry (cpu);
          break;

        case 0x1A400000U:
          /* Conditional compare, register or immediate.  */
          CondCompare (cpu);
          break;

        case 0x1A800000U:
          /* Conditional select.  */
          dexCondSelect (cpu);
          break;

        case 0x1AC00000U:
          /* Data processing: bit 30 is clear for the 2 source forms
             and set for the 1 source forms.  */
          if (instr & (1U << 30))
            dexDataProc1Source (cpu);
          else
            dexDataProc2Source (cpu);
          break;

        default:
          /* Should not reach here.  */
          HALT_NYI;
          break;
        }
      return;

    case DPREG_UNALLOC_101:
      HALT_UNALLOC;
      /* No return here: control continues into the 3 source cases
         below, exactly as the case labels are laid out.  */

    case DPREG_3SRC_110:
    case DPREG_3SRC_111:
      dexDataProc3Source (cpu);
      return;

    default:
      /* Should never reach here.  */
      HALT_NYI;
    }
}

/* Unconditional Branch immediate.
   Offset is a PC-relative byte offset in the range +/- 128MiB.
   The offset is assumed to be raw from the decode i.e. the
   simulator is expected to scale them from word offsets to byte.  */

/* Unconditional branch: OFFSET is handed unchanged to
   aarch64_set_next_PC_by_offset to select the next PC.  Nothing else
   is modified here.  */
static void
buc (sim_cpu *cpu, int32_t offset)
{
  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* Call nesting depth, used as the indentation width of the branch
   trace output.  It is only changed while branch tracing is enabled:
   bl and blr increment it, ret decrements it.  */
static unsigned stack_depth = 0;

/* BL: save the return address in LR, then branch by OFFSET.  When
   branch tracing is enabled the call is logged together with the first
   three argument registers.  */
static void
bl (sim_cpu *cpu, int32_t offset)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_save_LR (cpu);
  aarch64_set_next_PC_by_offset (cpu, offset);

  if (! TRACE_BRANCH_P (cpu))
    return;

  /* One more level of nesting; it is also the indentation width.  */
  ++ stack_depth;
  TRACE_BRANCH (cpu,
                " %*scall %" PRIx64 " [%s]"
                " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
                stack_depth, " ", aarch64_get_next_PC (cpu),
                aarch64_get_func (CPU_STATE (cpu),
                                  aarch64_get_next_PC (cpu)),
                aarch64_get_reg_u64 (cpu, 0, NO_SP),
                aarch64_get_reg_u64 (cpu, 1, NO_SP),
                aarch64_get_reg_u64 (cpu, 2, NO_SP));
}

/* Unconditional Branch register.
   Branch/return address is in source register.  */

/* BR: branch to the address held in register Rn.  */
static void
br (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  uint64_t destination;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  destination = aarch64_get_reg_u64 (cpu, rn, NO_SP);
  aarch64_set_next_PC (cpu, destination);
}

/* BLR: save the return address in LR, then branch to the address held
   in register Rn.  When branch tracing is enabled the call is logged
   together with the first three argument registers.  */
static void
blr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  /* Rn may be LR itself, so fetch the destination before LR is
     overwritten.  */
  uint64_t destination = aarch64_get_reg_u64 (cpu, rn, NO_SP);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_save_LR (cpu);
  aarch64_set_next_PC (cpu, destination);

  if (! TRACE_BRANCH_P (cpu))
    return;

  /* One more level of nesting; it is also the indentation width.  */
  ++ stack_depth;
  TRACE_BRANCH (cpu,
                " %*scall %" PRIx64 " [%s]"
                " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
                stack_depth, " ", aarch64_get_next_PC (cpu),
                aarch64_get_func (CPU_STATE (cpu),
                                  aarch64_get_next_PC (cpu)),
                aarch64_get_reg_u64 (cpu, 0, NO_SP),
                aarch64_get_reg_u64 (cpu, 1, NO_SP),
                aarch64_get_reg_u64 (cpu, 2, NO_SP));
}

/* RET: branch to the address held in register Rn.  Within the
   simulator this acts like br, except that with branch tracing enabled
   it logs register 0 as the result and drops one level of call
   nesting.  */
static void
ret (sim_cpu *cpu)
{
  uint64_t return_address
    = aarch64_get_reg_u64 (cpu, INSTR (9, 5), NO_SP);

  aarch64_set_next_PC (cpu, return_address);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (! TRACE_BRANCH_P (cpu))
    return;

  TRACE_BRANCH (cpu,
                " %*sreturn [result: %" PRIx64 "]",
                stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
  -- stack_depth;
}

/* NOP -- we implement this and call it from the decode in case we
   want to intercept it later.  The only effect is a decode trace
   entry.  */

static void
nop (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}

/* Data synchronization barrier.  This function does nothing beyond
   emitting a decode trace entry.  */

static void
dsb (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}

/* Data memory barrier.  This function does nothing beyond emitting a
   decode trace entry.  */

static void
dmb (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}

/* Instruction synchronization barrier.  This function does nothing
   beyond emitting a decode trace entry.  */

static void
isb (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}

/* Decode and execute B and BL (unconditional branch, immediate).  */
static void
dexBranchImmediate (sim_cpu *cpu)
{
  /* assert instr[30,26] == 00101
     instr[31] ==> 0 == B, 1 == BL
     instr[25,0] == imm26 branch offset counted in words.  */

  uint32_t top = INSTR (31, 31);
  /* We have a 26 bit signed word offset which we need to pass to the
     execute routine as a signed byte offset.  Scale by multiplying:
     left shifting a negative offset would be undefined behaviour.  */
  int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) * 4;

  if (top)
    bl (cpu, offset);
  else
    buc (cpu, offset);
}

/* Control Flow.  */

/* Conditional branch

   Offset is a PC-relative byte offset in the range +/- 1MiB pos is
   a bit position in the range 0 .. 63

   cc is a CondCode enum value as pulled out of the decode

   N.B. any offset register (source) can only be Xn or Wn.  */

static void
bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  /* Fall through to the next instruction unless condition CC holds.  */
  if (! testConditionCode (cpu, cc))
    return;

  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* CBNZ (32 bit): branch by OFFSET when Wt is non-zero.  */
static void
cbnz32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
    return;

  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* CBNZ (64 bit): branch by OFFSET when Xt is non-zero.  */
static void
cbnz (sim_cpu *cpu, int32_t offset)
{
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
    return;

  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* CBZ (32 bit): branch by OFFSET when Wt is zero.  */
static void
cbz32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
    return;

  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* CBZ (64 bit): branch by OFFSET when Xt is zero.  */
static void
cbz (sim_cpu *cpu, int32_t offset)
{
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
    return;

  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* TBNZ: branch by OFFSET when bit POS of Xt is set.  The register is
   always read as 64 bits, so one routine serves both operand sizes.  */
static void
tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
{
  unsigned rt = INSTR (4, 0);
  uint64_t tested_bit = ((uint64_t) 1) << pos;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if ((aarch64_get_reg_u64 (cpu, rt, NO_SP) & tested_bit) == 0)
    return;

  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* TBZ: branch by OFFSET when bit POS of Xt is clear.  The register is
   always read as 64 bits, so one routine serves both operand sizes.  */
static void
tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
{
  unsigned rt = INSTR (4, 0);
  uint64_t tested_bit = ((uint64_t) 1) << pos;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  if ((aarch64_get_reg_u64 (cpu, rt, NO_SP) & tested_bit) != 0)
    return;

  aarch64_set_next_PC_by_offset (cpu, offset);
}

/* Decode and execute CBZ and CBNZ (compare and branch, immediate).  */
static void
dexCompareBranchImmediate (sim_cpu *cpu)
{
  /* instr[30,25] = 01 1010
     instr[31]    = size : 0 ==> 32, 1 ==> 64
     instr[24]    = op : 0 ==> CBZ, 1 ==> CBNZ
     instr[23,5]  = simm19 branch offset counted in words
     instr[4,0]   = rt  */

  uint32_t size = INSTR (31, 31);
  uint32_t op   = INSTR (24, 24);
  /* Scale the word offset to bytes by multiplying: left shifting a
     negative offset would be undefined behaviour.  */
  int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) * 4;

  if (size == 0)
    {
      if (op == 0)
        cbz32 (cpu, offset);
      else
        cbnz32 (cpu, offset);
    }
  else
    {
      if (op == 0)
        cbz (cpu, offset);
      else
        cbnz (cpu, offset);
    }
}

/* Decode and execute TBZ and TBNZ (test bit and branch, immediate).  */
static void
dexTestBranchImmediate (sim_cpu *cpu)
{
  /* instr[31]    = b5 : bit 5 of test bit idx
     instr[30,25] = 01 1011
     instr[24]    = op : 0 ==> TBZ, 1 == TBNZ
     instr[23,19] = b40 : bits 4 to 0 of test bit idx
     instr[18,5]  = simm14 : signed offset counted in words
     instr[4,0]   = uimm5  */

  /* The index of the bit to test is b5:b40, i.e. 0 .. 63.  */
  uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
  /* Scale the word offset to bytes by multiplying: left shifting a
     negative offset would be undefined behaviour.  */
  int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) * 4;

  NYI_assert (30, 25, 0x1b);

  if (INSTR (24, 24) == 0)
    tbz (cpu, pos, offset);
  else
    tbnz (cpu, pos, offset);
}

/* Decode and execute B.cond (conditional branch, immediate).  */
static void
dexCondBranchImmediate (sim_cpu *cpu)
{
  /* instr[31,25] = 010 1010
     instr[24]    = op1; op => 00 ==> B.cond
     instr[23,5]  = simm19 : signed offset counted in words
     instr[4]     = op0
     instr[3,0]   = cond  */

  int32_t offset;
  /* op is op1:op0; only 00 is allocated.  */
  uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));

  NYI_assert (31, 25, 0x2a);

  if (op != 0)
    HALT_UNALLOC;

  /* Scale the word offset to bytes by multiplying: left shifting a
     negative offset would be undefined behaviour.  */
  offset = simm32 (aarch64_get_instr (cpu), 23, 5) * 4;

  bcc (cpu, offset, INSTR (3, 0));
}

/* Decode and execute the unconditional branch (register) group.  */
static void
dexBranchRegister (sim_cpu *cpu)
{
  /* instr[31,25] = 110 1011
     instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 4 => ERET, 5 => DRPS
     instr[20,16] = op2 : must be 11111
     instr[15,10] = op3 : must be 000000
     instr[9,5]   = rn
     instr[4,0]   = op4 : must be 00000.  */

  uint32_t op = INSTR (24, 21);
  uint32_t op2 = INSTR (20, 16);
  uint32_t op3 = INSTR (15, 10);
  uint32_t op4 = INSTR (4, 0);

  NYI_assert (31, 25, 0x6b);

  if (op2 != 0x1F || op3 != 0 || op4 != 0)
    HALT_UNALLOC;

  if (op == 0)
    br (cpu);

  else if (op == 1)
    blr (cpu);

  else if (op == 2)
    ret (cpu);

  else
    {
      /* ERET and DRPS require 0b11111 in the rn field, instr[9,5].
         Anything else is unallocated.  The field must not be read from
         instr[4,0]: that is op4, already known to be zero here, so the
         test below could never pass.  */
      uint32_t rn = INSTR (9, 5);

      if (rn != 0x1f)
        HALT_UNALLOC;

      /* ERET (4) and DRPS (5) are recognised but not implemented.  */
      if (op == 4 || op == 5)
        HALT_NYI;

      HALT_UNALLOC;
    }
}

/* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
   but this may not be available.  So instead we define the values we need
   here.  */
#define AngelSVC_Reason_Open		0x01
#define AngelSVC_Reason_Close		0x02
#define AngelSVC_Reason_Write		0x05
#define AngelSVC_Reason_Read		0x06
#define AngelSVC_Reason_IsTTY		0x09
#define AngelSVC_Reason_Seek		0x0A
#define AngelSVC_Reason_FLen		0x0C
#define AngelSVC_Reason_Remove		0x0E
#define AngelSVC_Reason_Rename		0x0F
#define AngelSVC_Reason_Clock		0x10
#define AngelSVC_Reason_Time		0x11
#define AngelSVC_Reason_System		0x12
#define AngelSVC_Reason_Errno		0x13
#define AngelSVC_Reason_GetCmdLine	0x15
#define AngelSVC_Reason_HeapInfo	0x16
#define AngelSVC_Reason_ReportException 0x18
#define AngelSVC_Reason_Elapsed         0x30


/* Handle an HLT instruction whose 16-bit immediate is VAL.

   Any immediate other than 0xf000 is traced and stops the simulation
   with SIM_SIGTRAP.  HLT #0xf000 is treated as an Angel SVC request:
   the reason code is read from the low 32 bits of register 0 and the
   argument (for most requests the address of a parameter block) from
   register 1.  RESULT, which stays zero unless a handler sets it, is
   written back to register 0 after the request has been serviced.

   NOTE(review): the statements following each sim_engine_halt call
   only make sense if sim_engine_halt does not return - confirm against
   its declaration.  */

static void
handle_halt (sim_cpu *cpu, uint32_t val)
{
  uint64_t result = 0;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Only HLT #0xf000 is an Angel request; anything else is a plain halt.  */
  if (val != 0xf000)
    {
      TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGTRAP);
    }

  /* We have encountered an Angel SVC call.  See if we can process it.  */
  switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
    {
    case AngelSVC_Reason_HeapInfo:
      {
	/* Get the values.  */
	uint64_t stack_top = aarch64_get_stack_start (cpu);
	uint64_t heap_base = aarch64_get_heap_start (cpu);

	/* Get the pointer.  Register 1 holds the address of a word that
	   in turn holds the address of the four-entry block to fill.  */
	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
	ptr = aarch64_get_mem_u64 (cpu, ptr);

	/* Fill in the memory block.  */
	/* Start addr of heap.  */
	aarch64_set_mem_u64 (cpu, ptr +  0, heap_base);
	/* End addr of heap.  */
	aarch64_set_mem_u64 (cpu, ptr +  8, stack_top);
	/* Lowest stack addr.  */
	aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
	/* Initial stack addr.  */
	aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);

	TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
      }
      break;

    case AngelSVC_Reason_Open:
      {
	/* Get the pointer  */
	/* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);.  */
	/* FIXME: For now we just assume that we will only be asked
	   to open the standard file descriptors.  */
	/* The parameter block is ignored: each call simply hands out the
	   next integer from a counter that starts at 0 and is never
	   reset.  */
	static int fd = 0;
	result = fd ++;

	TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
      }
      break;

    case AngelSVC_Reason_Close:
      {
	/* Nothing is actually closed; the value in register 1 is only
	   traced and success (0) is reported.  */
	uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
	TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
	result = 0;
      }
      break;

    case AngelSVC_Reason_Errno:
      /* No errno is tracked; always report 0.  */
      result = 0;
      TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
      break;

    case AngelSVC_Reason_Clock:
      /* Scale the host clock () value to hundredths of a second.  */
      result =
#ifdef CLOCKS_PER_SEC
	(CLOCKS_PER_SEC >= 100)
	? (clock () / (CLOCKS_PER_SEC / 100))
	: ((clock () * 100) / CLOCKS_PER_SEC)
#else
	/* Presume unix... clock() returns microseconds.  */
	(clock () / 10000)
#endif
	;
	TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
      break;

    case AngelSVC_Reason_GetCmdLine:
      {
	/* Get the pointer  */
	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
	ptr = aarch64_get_mem_u64 (cpu, ptr);

	/* FIXME: No command line for now.  */
	/* A zero 64-bit word is stored at the address loaded above.  */
	aarch64_set_mem_u64 (cpu, ptr, 0);
	TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
      }
      break;

    case AngelSVC_Reason_IsTTY:
      /* Every handle is reported as a terminal.  */
      result = 1;
	TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
      break;

    case AngelSVC_Reason_Write:
      {
	/* Get the pointer  */
	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
	/* Get the write control block: three consecutive 64-bit words
	   holding the descriptor, the buffer address and the length.  */
	uint64_t fd  = aarch64_get_mem_u64 (cpu, ptr);
	uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
	uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);

	TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
		       PRIx64 " on descriptor %" PRIx64,
		       len, buf, fd);

	/* Writes longer than 1280 bytes are rejected outright.  */
	if (len > 1280)
	  {
	    TRACE_SYSCALL (cpu,
			   " AngelSVC: Write: Suspiciously long write: %ld",
			   (long) len);
	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
			     sim_stopped, SIM_SIGBUS);
	  }
	else if (fd == 1)
	  {
	    /* Descriptor 1 goes to the host's stdout.  The "%.*s" format
	       prints at most LEN bytes and stops early at a NUL byte.  */
	    printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
	  }
	else if (fd == 2)
	  {
	    /* Descriptor 2 goes through the simulator's error printf.  */
	    TRACE (cpu, 0, "\n");
	    sim_io_eprintf (CPU_STATE (cpu), "%.*s",
			    (int) len, aarch64_get_mem_ptr (cpu, buf));
	    TRACE (cpu, 0, "\n");
	  }
	else
	  {
	    /* Only descriptors 1 and 2 are supported.  */
	    TRACE_SYSCALL (cpu,
			   " AngelSVC: Write: Unexpected file handle: %d",
			   (int) fd);
	    sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
			     sim_stopped, SIM_SIGABRT);
	  }
      }
      break;

    case AngelSVC_Reason_ReportException:
      {
	/* Get the pointer  */
	uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
	/*ptr = aarch64_get_mem_u64 (cpu, ptr);.  */
	/* The block at PTR holds the exception type followed by a state
	   word.  */
	uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
	uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);

	TRACE_SYSCALL (cpu,
		       "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
		       type, state);

	/* Type 0x20026 ends the run as a normal exit with STATE as the
	   exit code; any other type stops the simulation with
	   SIM_SIGINT.  */
	if (type == 0x20026)
	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
			   sim_exited, state);
	else
	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
			   sim_stopped, SIM_SIGINT);
      }
      break;

    /* The remaining known reason codes are not implemented and are
       handled exactly like an unknown one.  */
    case AngelSVC_Reason_Read:
    case AngelSVC_Reason_FLen:
    case AngelSVC_Reason_Seek:
    case AngelSVC_Reason_Remove:
    case AngelSVC_Reason_Time:
    case AngelSVC_Reason_System:
    case AngelSVC_Reason_Rename:
    case AngelSVC_Reason_Elapsed:
    default:
      TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
		     aarch64_get_reg_u32 (cpu, 0, NO_SP));
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
		       sim_stopped, SIM_SIGTRAP);
    }

  /* Hand the outcome of the request back in register 0.  */
  aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
}

/* Decode the exception generation group.  BRK ends the simulation,
   HLT is passed to handle_halt, the opc 0 and opc 5 encodings are
   reported as not yet implemented and anything else as unallocated.  */

static void
dexExcpnGen (sim_cpu *cpu)
{
  /* instr[31:24] = 11010100
     instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
                          010 ==> HLT,       101 ==> DBG GEN EXCPN
     instr[20,5]  = imm16
     instr[4,2]   = opc2 000 ==> OK, ow ==> UNALLOC
     instr[1,0]   = LL : discriminates opc  */

  uint32_t opc = INSTR (23, 21);
  uint32_t imm16 = INSTR (20, 5);
  uint32_t opc2 = INSTR (4, 2);
  uint32_t ll;

  NYI_assert (31, 24, 0xd4);

  if (opc2 != 0)
    HALT_UNALLOC;

  ll = INSTR (1, 0);

  /* We only implement HLT and BRK for now.  */
  switch (opc)
    {
    case 0:
    case 5:
      HALT_NYI;
      break;

    case 1:
      if (ll == 0)
	{
	  /* BRK: finish the run, using the low word of R0 as the exit
	     code.  */
	  TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
			   sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
	}
      HALT_UNALLOC;
      break;

    case 2:
      if (ll == 0)
	handle_halt (cpu, imm16);
      else
	HALT_UNALLOC;
      break;

    default:
      HALT_UNALLOC;
      break;
    }
}

/* Stub for reading system registers.

   OP0, OP1, CRN, CRM and OP2 are the encoding fields of the register
   named by an MRS instruction.  Returns the value of the register when
   it is one of the few that are modelled; any other encoding is
   reported through HALT_NYI.  */

static uint64_t
system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
	    unsigned crm, unsigned op2)
{
  /* Every register modelled here is encoded with op0 == 3 and
     op1 == 3.  All five fields are compared so that an unrelated
     encoding which merely shares CRm and op2 with one of these
     registers is not silently read as that register.  */
  if (op0 == 3 && op1 == 3)
    {
      if (crn == 0 && crm == 0 && op2 == 7)
	/* DCZID_EL0 - the Data Cache Zero ID register.
	   We do not support DC ZVA at the moment, so
	   we return a value with the disable bit set.
	   We implement support for the DCZID register since
	   it is used by the C library's memset function.  */
	return ((uint64_t) 1) << 4;

      if (crn == 0 && crm == 0 && op2 == 1)
	/* CTR_EL0 - the Cache Type Register.  */
	return 0x80008000UL;

      if (crn == 13 && crm == 0 && op2 == 2)
	/* TPIDR_EL0 - thread pointer id.  */
	return aarch64_get_thread_id (cpu);

      if (crn == 4 && crm == 4 && op2 == 0)
	/* FPCR - floating point control register.  */
	return aarch64_get_FPCR (cpu);

      if (crn == 4 && crm == 4 && op2 == 1)
	/* FPSR - floating point status register.  */
	return aarch64_get_FPSR (cpu);

      if (crn == 4 && crm == 2 && op2 == 0)
	/* NZCV - the condition flags.  */
	return aarch64_get_CPSR (cpu);
    }

  HALT_NYI;

  /* Only reached if HALT_NYI were ever to return; makes sure a
     non-void function never runs off its end.  */
  return 0;
}

/* Stub for writing system registers.

   OP0, OP1, CRN, CRM and OP2 are the encoding fields of the register
   named by an MSR (register) instruction and VAL is the value to
   store.  Only FPCR, FPSR and NZCV are modelled; any other encoding is
   reported through HALT_NYI.  */

static void
system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
	    unsigned crm, unsigned op2, uint64_t val)
{
  /* The three writable registers all live at op0 == 3, op1 == 3,
     CRn == 4.  Without the op0 and CRn tests a write to an unrelated
     register sharing CRm and op2 - for example DBGDTR_EL0, encoded as
     op0 == 2, op1 == 3, CRn == 0, CRm == 4, op2 == 0 - would be
     misdecoded as a write to FPCR.  */
  int known_group = (op0 == 3 && op1 == 3 && crn == 4);

  if (known_group && crm == 4 && op2 == 0)
    aarch64_set_FPCR (cpu, val);

  else if (known_group && crm == 4 && op2 == 1)
    aarch64_set_FPSR (cpu, val);

  else if (known_group && crm == 2 && op2 == 0)
    /* NZCV - the condition flags.  */
    aarch64_set_CPSR (cpu, val);

  else
    HALT_NYI;
}

/* MRS: read the system register selected by the instruction's encoding
   fields through system_get and store the value in register Rt.  */

static void
do_mrs (sim_cpu *cpu)
{
  /* instr[31,20] = 1101 0101 0011
     instr[19]    = op0 (low bit; the register's op0 is this bit plus 2)
     instr[18,16] = op1
     instr[15,12] = CRn
     instr[11,8]  = CRm
     instr[7,5]   = op2
     instr[4,0]   = Rt  */
  unsigned op0 = INSTR (19, 19) + 2;
  unsigned op1 = INSTR (18, 16);
  unsigned crn = INSTR (15, 12);
  unsigned crm = INSTR (11, 8);
  unsigned op2 = INSTR (7, 5);
  unsigned dest = INSTR (4, 0);
  uint64_t value;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  value = system_get (cpu, op0, op1, crn, crm, op2);
  aarch64_set_reg_u64 (cpu, dest, NO_SP, value);
}

/* MSR (immediate).  None of the PSTATE field writes are implemented:
   the SPSel, DAIFSet and DAIFClr encodings are reported as not yet
   implemented and every other encoding as unallocated.  */

static void
do_MSR_immediate (sim_cpu *cpu)
{
  /* instr[31:19] = 1101 0101 0000 0
     instr[18,16] = op1
     instr[15,12] = 0100
     instr[11,8]  = CRm (not examined)
     instr[7,5]   = op2
     instr[4,0]   = 1 1111  */

  unsigned field_op1 = INSTR (18, 16);
  unsigned field_op2 = INSTR (7, 5);

  NYI_assert (31, 19, 0x1AA0);
  NYI_assert (15, 12, 0x4);
  NYI_assert (4,  0,  0x1F);

  switch (field_op1)
    {
    case 0:
      if (field_op2 == 5)
	HALT_NYI; /* set SPSel.  */
      else
	HALT_UNALLOC;
      break;

    case 3:
      switch (field_op2)
	{
	case 6:
	  HALT_NYI; /* set DAIFset.  */
	  break;

	case 7:
	  HALT_NYI; /* set DAIFclr.  */
	  break;

	default:
	  HALT_UNALLOC;
	  break;
	}
      break;

    default:
      HALT_UNALLOC;
      break;
    }
}

/* MSR (register): write the value of register Rt to the system
   register selected by the instruction's encoding fields, through
   system_set.  */

static void
do_MSR_reg (sim_cpu *cpu)
{
  /* instr[31:20] = 1101 0101 0001
     instr[19]    = op0 (low bit; the register's op0 is this bit plus 2)
     instr[18,16] = op1
     instr[15,12] = CRn
     instr[11,8]  = CRm
     instr[7,5]   = op2
     instr[4,0]   = Rt  */

  unsigned op0 = INSTR (19, 19) + 2;
  unsigned op1 = INSTR (18, 16);
  unsigned crn = INSTR (15, 12);
  unsigned crm = INSTR (11, 8);
  unsigned op2 = INSTR (7, 5);
  unsigned src = INSTR (4, 0);
  uint64_t value;

  NYI_assert (31, 20, 0xD51);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);

  value = aarch64_get_reg_u64 (cpu, src, NO_SP);
  system_set (cpu, op0, op1, crn, crm, op2, value);
}

/* SYS: after checking the fixed opcode bits the instruction is
   accepted and has no effect on the simulated state.  */

static void
do_SYS (sim_cpu *cpu)
{
  /* instr[31,19] = 1101 0101 0000 1
     instr[18,16] = op1
     instr[15,12] = CRn
     instr[11,8]  = CRm
     instr[7,5]   = op2
     instr[4,0]   = Rt  */
  NYI_assert (31, 19, 0x1AA1);

  /* FIXME: For now we just silently accept system ops.  */
}

/* Decode the system instruction group and dispatch to the hint,
   barrier, MRS, MSR and SYS handlers.  The switch selects on
   instr[21,12], i.e. the L, op0, op1 and CRn fields taken together.

   NOTE(review): several arms rely on HALT_NYI / HALT_UNALLOC not
   returning; were they to return, control would fall through into the
   following case.  Confirm against the declaration of
   sim_engine_halt.  */

static void
dexSystem (sim_cpu *cpu)
{
  /* instr[31:22] = 1101 01010 0
     instr[21]    = L
     instr[20,19] = op0
     instr[18,16] = op1
     instr[15,12] = CRn
     instr[11,8]  = CRm
     instr[7,5]   = op2
     instr[4,0]   = uimm5  */

  /* We are interested in HINT, DSB, DMB and ISB

     Hint #0 encodes NOP (this is the only hint we care about)
     L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
     and the hint is executed as a NOP when
     CRm:op2 == 0000 000 OR CRm:op2 > 0000 101

     DSB, DMB, ISB are data synchronization barrier, data memory
     barrier and instruction synchronization barrier, respectively,
     where

     L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
     op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
     CRm<3:2> ==> domain, CRm<1:0> ==> types,
     domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
              10 ==> InnerShareable, 11 ==> FullSystem
     types :  01 ==> Reads, 10 ==> Writes,
              11 ==> All, 00 ==> All (domain == FullSystem).  */

  unsigned rt = INSTR (4, 0);

  NYI_assert (31, 22, 0x354);

  switch (INSTR (21, 12))
    {
    /* L == 0, op0 == 00, op1 == 011, CRn == 0010 : hints.  */
    case 0x032:
      if (rt == 0x1F)
	{
	  /* NOP has CRm != 0000 OR.  */
	  /*         (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
	  uint32_t crm = INSTR (11, 8);
	  uint32_t op2 = INSTR (7, 5);

	  if (crm != 0 || (op2 == 0 || op2 > 5))
	    {
	      /* Actually call nop method so we can reimplement it later.  */
	      nop (cpu);
	      return;
	    }
	}
      /* Hints with CRm == 0 and op2 in 1..5, or with Rt != 11111,
	 are not implemented.  */
      HALT_NYI;

    /* L == 0, op0 == 00, op1 == 011, CRn == 0011 : barriers.  */
    case 0x033:
      {
	uint32_t op2 =  INSTR (7, 5);

	switch (op2)
	  {
	  case 2: HALT_NYI;
	  case 4: dsb (cpu); return;
	  case 5: dmb (cpu); return;
	  case 6: isb (cpu); return;
	  default: HALT_UNALLOC;
	}
      }

    /* L == 1, op0 == 11, op1 == 011 with CRn == 0000, 0100 or 1101 :
       the system register reads that system_get knows about.  */
    case 0x3B0:
    case 0x3B4:
    case 0x3BD:
      do_mrs (cpu);
      return;

    /* L == 0, op0 == 01, op1 == 011, CRn == 0111.  */
    case 0x0B7:
      do_SYS (cpu); /* DC is an alias of SYS.  */
      return;

    default:
      /* L == 0 with the top bit of op0 set is MSR (register);
	 L == 0, op0 == 00 and CRn == 0100 is MSR (immediate).  */
      if (INSTR (21, 20) == 0x1)
	do_MSR_reg (cpu);
      else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
	do_MSR_immediate (cpu);
      else
	HALT_NYI;
      return;
    }
}

/* Second-level decoder for the branch, exception generation and system
   instruction groups.  Dispatches on the value dispatchBrExSys derives
   from the current instruction.  */

static void
dexBr (sim_cpu *cpu)
{
  /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
     assert  group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
     bits [31,29] of a BrExSys are the secondary dispatch vector.  */
  uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));

  switch (group2)
    {
    case BR_IMM_000:
    case BR_IMM_100:
      /* Call, then return: ISO C does not allow "return expr;" in a
	 function returning void, even when EXPR itself is void.  */
      dexBranchImmediate (cpu);
      return;

    case BR_IMMCMP_001:
    case BR_IMMCMP_101:
      /* Compare has bit 25 clear while test has it set.  */
      if (!INSTR (25, 25))
	dexCompareBranchImmediate (cpu);
      else
	dexTestBranchImmediate (cpu);
      return;

    case BR_IMMCOND_010:
      /* This is a conditional branch if bit 25 is clear otherwise
         unallocated.  */
      if (!INSTR (25, 25))
	dexCondBranchImmediate (cpu);
      else
	HALT_UNALLOC;
      return;

    case BR_REG_110:
      /* Unconditional branch reg has bit 25 set.  */
      if (INSTR (25, 25))
	dexBranchRegister (cpu);

      /* This includes both Excpn Gen, System and unalloc operations.
         We need to decode the Excpn Gen operation BRK so we can plant
         debugger entry points.
         Excpn Gen operations have instr [24] = 0.
         we need to decode at least one of the System operations NOP
         which is an alias for HINT #0.
         System operations have instr [24,22] = 100.  */
      else if (INSTR (24, 24) == 0)
	dexExcpnGen (cpu);

      else if (INSTR (24, 22) == 4)
	dexSystem (cpu);

      else
	HALT_UNALLOC;

      return;

    case BR_UNALLOC_011:
    case BR_UNALLOC_111:
      HALT_UNALLOC;
      /* Explicit return so that this arm can never fall through into
	 another one, whatever HALT_UNALLOC does.  */
      return;

    default:
      /* Should never reach here.  */
      HALT_NYI;
      return;
    }
}

/* Top-level decoder: classify the current instruction with
   dispatchGroup and hand it to the decoder for that group.  PC is not
   used here.  */

static void
aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
{
  /* We need to check if gdb wants an in here.  */
  /* checkBreak (cpu);.  */

  uint64_t insn_group = dispatchGroup (aarch64_get_instr (cpu));

  switch (insn_group)
    {
    case GROUP_PSEUDO_0000:
      dexPseudo (cpu);
      break;

    case GROUP_LDST_0100:
    case GROUP_LDST_0110:
    case GROUP_LDST_1100:
    case GROUP_LDST_1110:
      dexLdSt (cpu);
      break;

    case GROUP_DPREG_0101:
    case GROUP_DPREG_1101:
      dexDPReg (cpu);
      break;

    case GROUP_ADVSIMD_0111:
      dexAdvSIMD0 (cpu);
      break;

    case GROUP_ADVSIMD_1111:
      dexAdvSIMD1 (cpu);
      break;

    case GROUP_DPIMM_1000:
    case GROUP_DPIMM_1001:
      dexDPImm (cpu);
      break;

    case GROUP_BREXSYS_1010:
    case GROUP_BREXSYS_1011:
      dexBr (cpu);
      break;

    case GROUP_UNALLOC_0001:
    case GROUP_UNALLOC_0010:
    case GROUP_UNALLOC_0011:
      HALT_UNALLOC;
      /* No break: as before, control would continue into the default
	 arm if HALT_UNALLOC ever returned.  */

    default:
      /* Should never reach here.  */
      HALT_NYI;
    }
}

/* Fetch, decode and execute the instruction at the current PC.

   Returns FALSE, without executing anything, when the PC equals
   TOP_LEVEL_RETURN_PC; returns TRUE after an instruction has been
   executed.  The next PC is preset to PC + 4 before the instruction
   runs.  If the instruction word cannot be read in full the simulation
   is stopped with SIM_SIGBUS rather than decoding whatever value was
   left in the instruction slot by the previous step.  */

static bfd_boolean
aarch64_step (sim_cpu *cpu)
{
  uint64_t pc = aarch64_get_PC (cpu);

  if (pc == TOP_LEVEL_RETURN_PC)
    return FALSE;

  aarch64_set_next_PC (cpu, pc + 4);

  /* Code is always little-endian.  sim_core_read_buffer reports how
     many bytes it transferred, so anything other than 4 means the
     fetch address is not (fully) backed by readable memory.  */
  if (sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
			    & aarch64_get_instr (cpu), pc, 4) != 4)
    {
      TRACE_INSN (cpu, " instruction fetch failed at pc = %" PRIx64, pc);
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, pc,
		       sim_stopped, SIM_SIGBUS);
    }

  aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));

  TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
	      aarch64_get_instr (cpu));
  TRACE_DISASM (cpu, pc);

  aarch64_decode_and_execute (cpu, pc);

  return TRUE;
}

/* Run CPU 0 of SD until aarch64_step reports that there is nothing more
   to execute, then end the simulation as a normal exit whose code is
   the low word of R0.  After every executed instruction the PC is
   advanced and pending simulator events are serviced.  */

void
aarch64_run (SIM_DESC sd)
{
  sim_cpu *cpu = STATE_CPU (sd, 0);

  for (;;)
    {
      if (! aarch64_step (cpu))
	break;

      aarch64_update_PC (cpu);

      if (sim_events_tick (sd))
	sim_events_process (sd);
    }

  sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
		   sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
}

/* Prepare CPU to start executing at PC: point SP and FP at the start of
   the stack, plant the top-level return sentinel in LR, make PC the
   current program counter and initialise the LIT table.  */

void
aarch64_init (sim_cpu *cpu, uint64_t pc)
{
  const uint64_t stack_top = aarch64_get_stack_start (cpu);

  /* Both the stack pointer and the frame pointer start at the top of
     the stack.  */
  aarch64_set_reg_u64 (cpu, SP, SP_OK, stack_top);
  aarch64_set_reg_u64 (cpu, FP, SP_OK, stack_top);

  /* aarch64_step stops when the PC reaches TOP_LEVEL_RETURN_PC, so
     using it as the initial link register lets us detect a return from
     the outermost function.  */
  aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);

  /* Queue PC as the next program counter and make it current.  */
  aarch64_set_next_PC (cpu, pc);
  aarch64_update_PC (cpu);

  aarch64_init_LIT_table ();
}