      1 /*	$NetBSD: sljitNativeTILEGX_64.c,v 1.4 2019/01/20 23:14:16 alnsn Exp $	*/
      2 
      3 /*
      4  *    Stack-less Just-In-Time compiler
      5  *
      6  *    Copyright 2013-2013 Tilera Corporation(jiwang (at) tilera.com). All rights reserved.
      7  *    Copyright Zoltan Herczeg (hzmester (at) freemail.hu). All rights reserved.
      8  *
      9  * Redistribution and use in source and binary forms, with or without modification, are
     10  * permitted provided that the following conditions are met:
     11  *
     12  *   1. Redistributions of source code must retain the above copyright notice, this list of
     13  *      conditions and the following disclaimer.
     14  *
     15  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
     16  *      of conditions and the following disclaimer in the documentation and/or other materials
     17  *      provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
     20  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
     22  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
     24  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
     25  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     27  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28  */
     29 
     30 /* TileGX architecture. */
     31 /* Contributed by Tilera Corporation. */
     32 #include "sljitNativeTILEGX-encoder.c"
     33 
     34 #define SIMM_8BIT_MAX (0x7f)
     35 #define SIMM_8BIT_MIN (-0x80)
     36 #define SIMM_16BIT_MAX (0x7fff)
     37 #define SIMM_16BIT_MIN (-0x8000)
     38 #define SIMM_17BIT_MAX (0xffff)
     39 #define SIMM_17BIT_MIN (-0x10000)
     40 #define SIMM_32BIT_MAX (0x7fffffff)
     41 #define SIMM_32BIT_MIN (-0x7fffffff - 1)
     42 #define SIMM_48BIT_MAX (0x7fffffff0000L)
     43 #define SIMM_48BIT_MIN (-0x800000000000L)
     44 #define IMM16(imm) ((imm) & 0xffff)
     45 
     46 #define UIMM_16BIT_MAX (0xffff)
     47 
     48 #define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
     49 #define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
     50 #define TMP_REG3 (SLJIT_NUMBER_OF_REGISTERS + 4)
     51 #define ADDR_TMP (SLJIT_NUMBER_OF_REGISTERS + 5)
     52 #define PIC_ADDR_REG TMP_REG2
     53 
     54 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 6] = {
     55 	63, 0, 1, 2, 3, 4, 30, 31, 32, 33, 34, 54, 5, 16, 6, 7
     56 };
     57 
     58 #define SLJIT_LOCALS_REG_mapped 54
     59 #define TMP_REG1_mapped 5
     60 #define TMP_REG2_mapped 16
     61 #define TMP_REG3_mapped 6
     62 #define ADDR_TMP_mapped 7
     63 
      64 /* Flags are kept in volatile registers. */
     65 #define EQUAL_FLAG 8
     66 /* And carry flag as well. */
     67 #define ULESS_FLAG 9
     68 #define UGREATER_FLAG 10
     69 #define LESS_FLAG 11
     70 #define GREATER_FLAG 12
     71 #define OVERFLOW_FLAG 13
     72 
     73 #define ZERO 63
     74 #define RA 55
     75 #define TMP_EREG1 14
     76 #define TMP_EREG2 15
     77 
     78 #define LOAD_DATA 0x01
     79 #define WORD_DATA 0x00
     80 #define BYTE_DATA 0x02
     81 #define HALF_DATA 0x04
     82 #define INT_DATA 0x06
     83 #define SIGNED_DATA 0x08
     84 #define DOUBLE_DATA 0x10
     85 
     86 /* Separates integer and floating point registers */
     87 #define GPR_REG 0xf
     88 
     89 #define MEM_MASK 0x1f
     90 
     91 #define WRITE_BACK 0x00020
     92 #define ARG_TEST 0x00040
     93 #define ALT_KEEP_CACHE 0x00080
     94 #define CUMULATIVE_OP 0x00100
     95 #define LOGICAL_OP 0x00200
     96 #define IMM_OP 0x00400
     97 #define SRC2_IMM 0x00800
     98 
     99 #define UNUSED_DEST 0x01000
    100 #define REG_DEST 0x02000
    101 #define REG1_SOURCE 0x04000
    102 #define REG2_SOURCE 0x08000
    103 #define SLOW_SRC1 0x10000
    104 #define SLOW_SRC2 0x20000
    105 #define SLOW_DEST 0x40000
    106 
    107 /* Only these flags are set. UNUSED_DEST is not set when no flags should be set.
    108  */
    109 #define CHECK_FLAGS(list) (!(flags & UNUSED_DEST) || (op & GET_FLAGS(~(list))))
    110 
    111 SLJIT_API_FUNC_ATTRIBUTE const char *sljit_get_platform_name(void)
    112 {
    113 	return "TileGX" SLJIT_CPUINFO;
    114 }
    115 
    116 /* Length of an instruction word */
    117 typedef sljit_uw sljit_ins;
    118 
    119 struct jit_instr {
    120 	const struct tilegx_opcode* opcode;
    121 	tilegx_pipeline pipe;
    122 	unsigned long input_registers;
    123 	unsigned long output_registers;
    124 	int operand_value[4];
    125 	int line;
    126 };
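
         /* The input_registers / output_registers fields hold one bit per
            physical register; assign_pipes() below uses them to reject a
            candidate bundle whose instructions would have an intra-bundle
            read-after-write or write-after-write hazard. */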
    127 
    128 /* Opcode Helper Macros */
    129 #define TILEGX_X_MODE 0
    130 
    131 #define X_MODE create_Mode(TILEGX_X_MODE)
    132 
    133 #define FNOP_X0 \
    134 	create_Opcode_X0(RRR_0_OPCODE_X0) | \
    135 	create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) | \
    136 	create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0)
    137 
    138 #define FNOP_X1 \
    139 	create_Opcode_X1(RRR_0_OPCODE_X1) | \
    140 	create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) | \
    141 	create_UnaryOpcodeExtension_X1(FNOP_UNARY_OPCODE_X1)
    142 
    143 #define NOP \
    144 	create_Mode(TILEGX_X_MODE) | FNOP_X0 | FNOP_X1
    145 
    146 #define ANOP_X0 \
    147 	create_Opcode_X0(RRR_0_OPCODE_X0) | \
    148 	create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) | \
    149 	create_UnaryOpcodeExtension_X0(NOP_UNARY_OPCODE_X0)
    150 
    151 #define BPT create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    152 	create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) | \
    153 	create_UnaryOpcodeExtension_X1(ILL_UNARY_OPCODE_X1) | \
    154 	create_Dest_X1(0x1C) | create_SrcA_X1(0x25) | ANOP_X0
    155 
    156 #define ADD_X1 \
    157 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    158 	create_RRROpcodeExtension_X1(ADD_RRR_0_OPCODE_X1) | FNOP_X0
    159 
    160 #define ADDI_X1 \
    161 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(IMM8_OPCODE_X1) | \
    162 	create_Imm8OpcodeExtension_X1(ADDI_IMM8_OPCODE_X1) | FNOP_X0
    163 
    164 #define SUB_X1 \
    165 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    166 	create_RRROpcodeExtension_X1(SUB_RRR_0_OPCODE_X1) | FNOP_X0
    167 
    168 #define NOR_X1 \
    169 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    170 	create_RRROpcodeExtension_X1(NOR_RRR_0_OPCODE_X1) | FNOP_X0
    171 
    172 #define OR_X1 \
    173 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    174 	create_RRROpcodeExtension_X1(OR_RRR_0_OPCODE_X1) | FNOP_X0
    175 
    176 #define AND_X1 \
    177 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    178 	create_RRROpcodeExtension_X1(AND_RRR_0_OPCODE_X1) | FNOP_X0
    179 
    180 #define XOR_X1 \
    181 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    182 	create_RRROpcodeExtension_X1(XOR_RRR_0_OPCODE_X1) | FNOP_X0
    183 
    184 #define CMOVNEZ_X0 \
    185 	create_Mode(TILEGX_X_MODE) | create_Opcode_X0(RRR_0_OPCODE_X0) | \
    186 	create_RRROpcodeExtension_X0(CMOVNEZ_RRR_0_OPCODE_X0) | FNOP_X1
    187 
    188 #define CMOVEQZ_X0 \
    189 	create_Mode(TILEGX_X_MODE) | create_Opcode_X0(RRR_0_OPCODE_X0) | \
    190 	create_RRROpcodeExtension_X0(CMOVEQZ_RRR_0_OPCODE_X0) | FNOP_X1
    191 
    192 #define ADDLI_X1 \
    193 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(ADDLI_OPCODE_X1) | FNOP_X0
    194 
    195 #define V4INT_L_X1 \
    196 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    197 	create_RRROpcodeExtension_X1(V4INT_L_RRR_0_OPCODE_X1) | FNOP_X0
    198 
    199 #define BFEXTU_X0 \
    200 	create_Mode(TILEGX_X_MODE) | create_Opcode_X0(BF_OPCODE_X0) | \
    201 	create_BFOpcodeExtension_X0(BFEXTU_BF_OPCODE_X0) | FNOP_X1
    202 
    203 #define BFEXTS_X0 \
    204 	create_Mode(TILEGX_X_MODE) | create_Opcode_X0(BF_OPCODE_X0) | \
    205 	create_BFOpcodeExtension_X0(BFEXTS_BF_OPCODE_X0) | FNOP_X1
    206 
    207 #define SHL16INSLI_X1 \
    208 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(SHL16INSLI_OPCODE_X1) | FNOP_X0
    209 
    210 #define ST_X1 \
    211 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    212 	create_RRROpcodeExtension_X1(ST_RRR_0_OPCODE_X1) | create_Dest_X1(0x0) | FNOP_X0
    213 
    214 #define LD_X1 \
    215 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    216 	create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) | \
    217 	create_UnaryOpcodeExtension_X1(LD_UNARY_OPCODE_X1) | FNOP_X0
    218 
    219 #define JR_X1 \
    220 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    221 	create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) | \
    222 	create_UnaryOpcodeExtension_X1(JR_UNARY_OPCODE_X1) | FNOP_X0
    223 
    224 #define JALR_X1 \
    225 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    226 	create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) | \
    227 	create_UnaryOpcodeExtension_X1(JALR_UNARY_OPCODE_X1) | FNOP_X0
    228 
    229 #define CLZ_X0 \
    230 	create_Mode(TILEGX_X_MODE) | create_Opcode_X0(RRR_0_OPCODE_X0) | \
    231 	create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) | \
    232 	create_UnaryOpcodeExtension_X0(CNTLZ_UNARY_OPCODE_X0) | FNOP_X1
    233 
    234 #define CMPLTUI_X1 \
    235 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(IMM8_OPCODE_X1) | \
    236 	create_Imm8OpcodeExtension_X1(CMPLTUI_IMM8_OPCODE_X1) | FNOP_X0
    237 
    238 #define CMPLTU_X1 \
    239 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    240 	create_RRROpcodeExtension_X1(CMPLTU_RRR_0_OPCODE_X1) | FNOP_X0
    241 
    242 #define CMPLTS_X1 \
    243 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    244 	create_RRROpcodeExtension_X1(CMPLTS_RRR_0_OPCODE_X1) | FNOP_X0
    245 
    246 #define XORI_X1 \
    247 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(IMM8_OPCODE_X1) | \
    248 	create_Imm8OpcodeExtension_X1(XORI_IMM8_OPCODE_X1) | FNOP_X0
    249 
    250 #define ORI_X1 \
    251 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(IMM8_OPCODE_X1) | \
    252 	create_Imm8OpcodeExtension_X1(ORI_IMM8_OPCODE_X1) | FNOP_X0
    253 
    254 #define ANDI_X1 \
    255 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(IMM8_OPCODE_X1) | \
    256 	create_Imm8OpcodeExtension_X1(ANDI_IMM8_OPCODE_X1) | FNOP_X0
    257 
    258 #define SHLI_X1 \
    259 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(SHIFT_OPCODE_X1) | \
    260 	create_ShiftOpcodeExtension_X1(SHLI_SHIFT_OPCODE_X1) | FNOP_X0
    261 
    262 #define SHL_X1 \
    263 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    264 	create_RRROpcodeExtension_X1(SHL_RRR_0_OPCODE_X1) | FNOP_X0
    265 
    266 #define SHRSI_X1 \
    267 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(SHIFT_OPCODE_X1) | \
    268 	create_ShiftOpcodeExtension_X1(SHRSI_SHIFT_OPCODE_X1) | FNOP_X0
    269 
    270 #define SHRS_X1 \
    271 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    272 	create_RRROpcodeExtension_X1(SHRS_RRR_0_OPCODE_X1) | FNOP_X0
    273 
    274 #define SHRUI_X1 \
    275 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(SHIFT_OPCODE_X1) | \
    276 	create_ShiftOpcodeExtension_X1(SHRUI_SHIFT_OPCODE_X1) | FNOP_X0
    277 
    278 #define SHRU_X1 \
    279 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(RRR_0_OPCODE_X1) | \
    280 	create_RRROpcodeExtension_X1(SHRU_RRR_0_OPCODE_X1) | FNOP_X0
    281 
    282 #define BEQZ_X1 \
    283 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(BRANCH_OPCODE_X1) | \
    284 	create_BrType_X1(BEQZ_BRANCH_OPCODE_X1) | FNOP_X0
    285 
    286 #define BNEZ_X1 \
    287 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(BRANCH_OPCODE_X1) | \
    288 	create_BrType_X1(BNEZ_BRANCH_OPCODE_X1) | FNOP_X0
    289 
    290 #define J_X1 \
    291 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(JUMP_OPCODE_X1) | \
    292 	create_JumpOpcodeExtension_X1(J_JUMP_OPCODE_X1) | FNOP_X0
    293 
    294 #define JAL_X1 \
    295 	create_Mode(TILEGX_X_MODE) | create_Opcode_X1(JUMP_OPCODE_X1) | \
    296 	create_JumpOpcodeExtension_X1(JAL_JUMP_OPCODE_X1) | FNOP_X0
    297 
    298 #define DEST_X0(x) create_Dest_X0(x)
    299 #define SRCA_X0(x) create_SrcA_X0(x)
    300 #define SRCB_X0(x) create_SrcB_X0(x)
    301 #define DEST_X1(x) create_Dest_X1(x)
    302 #define SRCA_X1(x) create_SrcA_X1(x)
    303 #define SRCB_X1(x) create_SrcB_X1(x)
    304 #define IMM16_X1(x) create_Imm16_X1(x)
    305 #define IMM8_X1(x) create_Imm8_X1(x)
    306 #define BFSTART_X0(x) create_BFStart_X0(x)
    307 #define BFEND_X0(x) create_BFEnd_X0(x)
    308 #define SHIFTIMM_X1(x) create_ShAmt_X1(x)
    309 #define JOFF_X1(x) create_JumpOff_X1(x)
    310 #define BOFF_X1(x) create_BrOff_X1(x)
    311 
    312 static const tilegx_mnemonic data_transfer_insts[16] = {
    313 	/* u w s */ TILEGX_OPC_ST   /* st */,
    314 	/* u w l */ TILEGX_OPC_LD   /* ld */,
    315 	/* u b s */ TILEGX_OPC_ST1  /* st1 */,
    316 	/* u b l */ TILEGX_OPC_LD1U /* ld1u */,
    317 	/* u h s */ TILEGX_OPC_ST2  /* st2 */,
    318 	/* u h l */ TILEGX_OPC_LD2U /* ld2u */,
    319 	/* u i s */ TILEGX_OPC_ST4  /* st4 */,
    320 	/* u i l */ TILEGX_OPC_LD4U /* ld4u */,
    321 	/* s w s */ TILEGX_OPC_ST   /* st */,
    322 	/* s w l */ TILEGX_OPC_LD   /* ld */,
    323 	/* s b s */ TILEGX_OPC_ST1  /* st1 */,
    324 	/* s b l */ TILEGX_OPC_LD1S /* ld1s */,
    325 	/* s h s */ TILEGX_OPC_ST2  /* st2 */,
    326 	/* s h l */ TILEGX_OPC_LD2S /* ld2s */,
    327 	/* s i s */ TILEGX_OPC_ST4  /* st4 */,
    328 	/* s i l */ TILEGX_OPC_LD4S /* ld4s */,
    329 };
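
         /* The table is indexed by the data-type flags defined above.  A worked
            example (illustrative): a signed byte load combines
            BYTE_DATA (0x02) | LOAD_DATA (0x01) | SIGNED_DATA (0x08) = 0x0b, and
            data_transfer_insts[0x0b] is TILEGX_OPC_LD1S.  Stores ignore the sign
            bit, which is why both halves of the table share the same
            st/st1/st2/st4 entries. */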
    330 
    331 #ifdef TILEGX_JIT_DEBUG
    332 static sljit_s32 push_inst_debug(struct sljit_compiler *compiler, sljit_ins ins, int line)
    333 {
    334 	sljit_ins *ptr = (sljit_ins *)ensure_buf(compiler, sizeof(sljit_ins));
    335 	FAIL_IF(!ptr);
    336 	*ptr = ins;
    337 	compiler->size++;
    338 	printf("|%04d|S0|:\t\t", line);
    339 	print_insn_tilegx(ptr);
    340 	return SLJIT_SUCCESS;
    341 }
    342 
    343 static sljit_s32 push_inst_nodebug(struct sljit_compiler *compiler, sljit_ins ins)
    344 {
    345 	sljit_ins *ptr = (sljit_ins *)ensure_buf(compiler, sizeof(sljit_ins));
    346 	FAIL_IF(!ptr);
    347 	*ptr = ins;
    348 	compiler->size++;
    349 	return SLJIT_SUCCESS;
    350 }
    351 
    352 #define push_inst(a, b) push_inst_debug(a, b, __LINE__)
    353 #else
    354 static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins)
    355 {
    356 	sljit_ins *ptr = (sljit_ins *)ensure_buf(compiler, sizeof(sljit_ins));
    357 	FAIL_IF(!ptr);
    358 	*ptr = ins;
    359 	compiler->size++;
    360 	return SLJIT_SUCCESS;
    361 }
    362 #endif
    363 
    364 #define BUNDLE_FORMAT_MASK(p0, p1, p2) \
    365 	((p0) | ((p1) << 8) | ((p2) << 16))
    366 
    367 #define BUNDLE_FORMAT(p0, p1, p2) \
    368 	{ \
    369 		{ \
    370 			(tilegx_pipeline)(p0), \
    371 			(tilegx_pipeline)(p1), \
    372 			(tilegx_pipeline)(p2) \
    373 		}, \
    374 		BUNDLE_FORMAT_MASK(1 << (p0), 1 << (p1), (1 << (p2))) \
    375 	}
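
         /* Illustrative expansion: the X-format entry
            BUNDLE_FORMAT(TILEGX_PIPELINE_X0, TILEGX_PIPELINE_X1, NO_PIPELINE)
            yields a pipe_mask of
                (1 << TILEGX_PIPELINE_X0)
              | ((1 << TILEGX_PIPELINE_X1) << 8)
              | ((1 << NO_PIPELINE) << 16),
            one byte per issue slot.  compute_format() below builds the same
            per-slot layout from each opcode's allowed-pipe masks and accepts a
            format when every slot's required pipe bit is present. */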
    376 
    377 #define NO_PIPELINE TILEGX_NUM_PIPELINE_ENCODINGS
    378 
    379 #define tilegx_is_x_pipeline(p) ((int)(p) <= (int)TILEGX_PIPELINE_X1)
    380 
    381 #define PI(encoding) \
    382 	push_inst(compiler, encoding)
    383 
    384 #define PB3(opcode, dst, srca, srcb) \
    385 	push_3_buffer(compiler, opcode, dst, srca, srcb, __LINE__)
    386 
    387 #define PB2(opcode, dst, src) \
    388 	push_2_buffer(compiler, opcode, dst, src, __LINE__)
    389 
    390 #define JR(reg) \
    391 	push_jr_buffer(compiler, TILEGX_OPC_JR, reg, __LINE__)
    392 
    393 #define ADD(dst, srca, srcb) \
    394 	push_3_buffer(compiler, TILEGX_OPC_ADD, dst, srca, srcb, __LINE__)
    395 
    396 #define SUB(dst, srca, srcb) \
    397 	push_3_buffer(compiler, TILEGX_OPC_SUB, dst, srca, srcb, __LINE__)
    398 
    399 #define MUL(dst, srca, srcb) \
    400 	push_3_buffer(compiler, TILEGX_OPC_MULX, dst, srca, srcb, __LINE__)
    401 
    402 #define NOR(dst, srca, srcb) \
    403 	push_3_buffer(compiler, TILEGX_OPC_NOR, dst, srca, srcb, __LINE__)
    404 
    405 #define OR(dst, srca, srcb) \
    406 	push_3_buffer(compiler, TILEGX_OPC_OR, dst, srca, srcb, __LINE__)
    407 
    408 #define XOR(dst, srca, srcb) \
    409 	push_3_buffer(compiler, TILEGX_OPC_XOR, dst, srca, srcb, __LINE__)
    410 
    411 #define AND(dst, srca, srcb) \
    412 	push_3_buffer(compiler, TILEGX_OPC_AND, dst, srca, srcb, __LINE__)
    413 
    414 #define CLZ(dst, src) \
    415 	push_2_buffer(compiler, TILEGX_OPC_CLZ, dst, src, __LINE__)
    416 
    417 #define SHLI(dst, srca, srcb) \
    418 	push_3_buffer(compiler, TILEGX_OPC_SHLI, dst, srca, srcb, __LINE__)
    419 
    420 #define SHRUI(dst, srca, imm) \
    421 	push_3_buffer(compiler, TILEGX_OPC_SHRUI, dst, srca, imm, __LINE__)
    422 
    423 #define XORI(dst, srca, imm) \
    424 	push_3_buffer(compiler, TILEGX_OPC_XORI, dst, srca, imm, __LINE__)
    425 
    426 #define ORI(dst, srca, imm) \
    427 	push_3_buffer(compiler, TILEGX_OPC_ORI, dst, srca, imm, __LINE__)
    428 
    429 #define CMPLTU(dst, srca, srcb) \
    430 	push_3_buffer(compiler, TILEGX_OPC_CMPLTU, dst, srca, srcb, __LINE__)
    431 
    432 #define CMPLTS(dst, srca, srcb) \
    433 	push_3_buffer(compiler, TILEGX_OPC_CMPLTS, dst, srca, srcb, __LINE__)
    434 
    435 #define CMPLTUI(dst, srca, imm) \
    436 	push_3_buffer(compiler, TILEGX_OPC_CMPLTUI, dst, srca, imm, __LINE__)
    437 
    438 #define CMOVNEZ(dst, srca, srcb) \
    439 	push_3_buffer(compiler, TILEGX_OPC_CMOVNEZ, dst, srca, srcb, __LINE__)
    440 
    441 #define CMOVEQZ(dst, srca, srcb) \
    442 	push_3_buffer(compiler, TILEGX_OPC_CMOVEQZ, dst, srca, srcb, __LINE__)
    443 
    444 #define ADDLI(dst, srca, srcb) \
    445 	push_3_buffer(compiler, TILEGX_OPC_ADDLI, dst, srca, srcb, __LINE__)
    446 
    447 #define SHL16INSLI(dst, srca, srcb) \
    448 	push_3_buffer(compiler, TILEGX_OPC_SHL16INSLI, dst, srca, srcb, __LINE__)
    449 
    450 #define LD_ADD(dst, addr, adjust) \
    451 	push_3_buffer(compiler, TILEGX_OPC_LD_ADD, dst, addr, adjust, __LINE__)
    452 
    453 #define ST_ADD(src, addr, adjust) \
    454 	push_3_buffer(compiler, TILEGX_OPC_ST_ADD, src, addr, adjust, __LINE__)
    455 
    456 #define LD(dst, addr) \
    457 	push_2_buffer(compiler, TILEGX_OPC_LD, dst, addr, __LINE__)
    458 
    459 #define BFEXTU(dst, src, start, end) \
    460 	push_4_buffer(compiler, TILEGX_OPC_BFEXTU, dst, src, start, end, __LINE__)
    461 
    462 #define BFEXTS(dst, src, start, end) \
    463 	push_4_buffer(compiler, TILEGX_OPC_BFEXTS, dst, src, start, end, __LINE__)
    464 
    465 #define ADD_SOLO(dest, srca, srcb) \
    466 	push_inst(compiler, ADD_X1 | DEST_X1(dest) | SRCA_X1(srca) | SRCB_X1(srcb))
    467 
    468 #define ADDI_SOLO(dest, srca, imm) \
    469 	push_inst(compiler, ADDI_X1 | DEST_X1(dest) | SRCA_X1(srca) | IMM8_X1(imm))
    470 
    471 #define ADDLI_SOLO(dest, srca, imm) \
    472 	push_inst(compiler, ADDLI_X1 | DEST_X1(dest) | SRCA_X1(srca) | IMM16_X1(imm))
    473 
    474 #define SHL16INSLI_SOLO(dest, srca, imm) \
    475 	push_inst(compiler, SHL16INSLI_X1 | DEST_X1(dest) | SRCA_X1(srca) | IMM16_X1(imm))
    476 
    477 #define JALR_SOLO(reg) \
    478 	push_inst(compiler, JALR_X1 | SRCA_X1(reg))
    479 
    480 #define JR_SOLO(reg) \
    481 	push_inst(compiler, JR_X1 | SRCA_X1(reg))
    482 
    483 struct Format {
    484 	/* Mapping of bundle issue slot to assigned pipe. */
    485 	tilegx_pipeline pipe[TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE];
    486 
    487 	/* Mask of pipes used by this bundle. */
    488 	unsigned int pipe_mask;
    489 };
    490 
    491 const struct Format formats[] =
    492 {
    493 	/* In Y format we must always have something in Y2, since it has
    494 	* no fnop, so this conveys that Y2 must always be used. */
    495 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y0, TILEGX_PIPELINE_Y2, NO_PIPELINE),
    496 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y1, TILEGX_PIPELINE_Y2, NO_PIPELINE),
    497 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y2, TILEGX_PIPELINE_Y0, NO_PIPELINE),
    498 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y2, TILEGX_PIPELINE_Y1, NO_PIPELINE),
    499 
    500 	/* Y format has three instructions. */
    501 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y0, TILEGX_PIPELINE_Y1, TILEGX_PIPELINE_Y2),
    502 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y0, TILEGX_PIPELINE_Y2, TILEGX_PIPELINE_Y1),
    503 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y1, TILEGX_PIPELINE_Y0, TILEGX_PIPELINE_Y2),
    504 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y1, TILEGX_PIPELINE_Y2, TILEGX_PIPELINE_Y0),
    505 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y2, TILEGX_PIPELINE_Y0, TILEGX_PIPELINE_Y1),
    506 	BUNDLE_FORMAT(TILEGX_PIPELINE_Y2, TILEGX_PIPELINE_Y1, TILEGX_PIPELINE_Y0),
    507 
    508 	/* X format has only two instructions. */
    509 	BUNDLE_FORMAT(TILEGX_PIPELINE_X0, TILEGX_PIPELINE_X1, NO_PIPELINE),
    510 	BUNDLE_FORMAT(TILEGX_PIPELINE_X1, TILEGX_PIPELINE_X0, NO_PIPELINE)
    511 };
    512 
    513 
    514 struct jit_instr inst_buf[TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE];
    515 unsigned long inst_buf_index;
    516 
    517 tilegx_pipeline get_any_valid_pipe(const struct tilegx_opcode* opcode)
    518 {
    519 	/* FIXME: tile: we could pregenerate this. */
    520 	int pipe;
    521 	for (pipe = 0; ((opcode->pipes & (1 << pipe)) == 0 && pipe < TILEGX_NUM_PIPELINE_ENCODINGS); pipe++)
    522 		;
    523 	return (tilegx_pipeline)(pipe);
    524 }
    525 
    526 void insert_nop(tilegx_mnemonic opc, int line)
    527 {
    528 	const struct tilegx_opcode* opcode = NULL;
    529 
    530 	memmove(&inst_buf[1], &inst_buf[0], inst_buf_index * sizeof inst_buf[0]);
    531 
    532 	opcode = &tilegx_opcodes[opc];
    533 	inst_buf[0].opcode = opcode;
    534 	inst_buf[0].pipe = get_any_valid_pipe(opcode);
    535 	inst_buf[0].input_registers = 0;
    536 	inst_buf[0].output_registers = 0;
    537 	inst_buf[0].line = line;
    538 	++inst_buf_index;
    539 }
    540 
    541 const struct Format* compute_format()
    542 {
    543 	unsigned int compatible_pipes = BUNDLE_FORMAT_MASK(
    544 		inst_buf[0].opcode->pipes,
    545 		inst_buf[1].opcode->pipes,
    546 		(inst_buf_index == 3 ? inst_buf[2].opcode->pipes : (1 << NO_PIPELINE)));
    547 
    548 	const struct Format* match = NULL;
    549 	const struct Format *b = NULL;
    550 	unsigned int i;
    551 	for (i = 0; i < sizeof formats / sizeof formats[0]; i++) {
    552 		b = &formats[i];
    553 		if ((b->pipe_mask & compatible_pipes) == b->pipe_mask) {
    554 			match = b;
    555 			break;
    556 		}
    557 	}
    558 
    559 	return match;
    560 }
    561 
    562 sljit_s32 assign_pipes()
    563 {
    564 	unsigned long output_registers = 0;
    565 	unsigned int i = 0;
    566 
    567 	if (inst_buf_index == 1) {
    568 		tilegx_mnemonic opc = inst_buf[0].opcode->can_bundle
    569 					? TILEGX_OPC_FNOP : TILEGX_OPC_NOP;
    570 		insert_nop(opc, __LINE__);
    571 	}
    572 
    573 	const struct Format* match = compute_format();
    574 
    575 	if (match == NULL)
    576 		return -1;
    577 
    578 	for (i = 0; i < inst_buf_index; i++) {
    579 
    580 		if ((i > 0) && ((inst_buf[i].input_registers & output_registers) != 0))
    581 			return -1;
    582 
    583 		if ((i > 0) && ((inst_buf[i].output_registers & output_registers) != 0))
    584 			return -1;
    585 
    586 		/* Don't include Rzero in the match set, to avoid triggering
    587 		   needlessly on 'prefetch' instrs. */
    588 
    589 		output_registers |= inst_buf[i].output_registers & 0xFFFFFFFFFFFFFFL;
    590 
    591 		inst_buf[i].pipe = match->pipe[i];
    592 	}
    593 
    594 	/* If only 2 instrs, and in Y-mode, insert a nop. */
    595 	if (inst_buf_index == 2 && !tilegx_is_x_pipeline(match->pipe[0])) {
    596 		insert_nop(TILEGX_OPC_FNOP, __LINE__);
    597 
    598 		/* Select the yet unassigned pipe. */
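         		/* (The three Y pipe encodings form a fixed set, so subtracting
         		   the two pipes already assigned from their sum yields the one
         		   that remains.) */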
    599 		tilegx_pipeline pipe = (tilegx_pipeline)(((TILEGX_PIPELINE_Y0
    600 					+ TILEGX_PIPELINE_Y1 + TILEGX_PIPELINE_Y2)
    601 					- (inst_buf[1].pipe + inst_buf[2].pipe)));
    602 
    603 		inst_buf[0].pipe = pipe;
    604 	}
    605 
    606 	return 0;
    607 }
    608 
    609 tilegx_bundle_bits get_bundle_bit(struct jit_instr *inst)
    610 {
    611 	int i, val;
    612 	const struct tilegx_opcode* opcode = inst->opcode;
    613 	tilegx_bundle_bits bits = opcode->fixed_bit_values[inst->pipe];
    614 
    615 	const struct tilegx_operand* operand = NULL;
    616 	for (i = 0; i < opcode->num_operands; i++) {
    617 		operand = &tilegx_operands[opcode->operands[inst->pipe][i]];
    618 		val = inst->operand_value[i];
    619 
    620 		bits |= operand->insert(val);
    621 	}
    622 
    623 	return bits;
    624 }
    625 
    626 static sljit_s32 update_buffer(struct sljit_compiler *compiler)
    627 {
    628 	int i;
    629 	int orig_index = inst_buf_index;
    630 	struct jit_instr inst0 = inst_buf[0];
    631 	struct jit_instr inst1 = inst_buf[1];
    632 	struct jit_instr inst2 = inst_buf[2];
    633 	tilegx_bundle_bits bits = 0;
    634 
    635 	/* If the bundle is valid as is, perform the encoding and return 1. */
    636 	if (assign_pipes() == 0) {
    637 		for (i = 0; i < inst_buf_index; i++) {
    638 			bits |= get_bundle_bit(inst_buf + i);
    639 #ifdef TILEGX_JIT_DEBUG
    640 			printf("|%04d", inst_buf[i].line);
    641 #endif
    642 		}
    643 #ifdef TILEGX_JIT_DEBUG
    644 		if (inst_buf_index == 3)
    645 			printf("|M0|:\t");
    646 		else
    647 			printf("|M0|:\t\t");
    648 		print_insn_tilegx(&bits);
    649 #endif
    650 
    651 		inst_buf_index = 0;
    652 
    653 #ifdef TILEGX_JIT_DEBUG
    654 		return push_inst_nodebug(compiler, bits);
    655 #else
    656 		return push_inst(compiler, bits);
    657 #endif
    658 	}
    659 
    660 	/* If the bundle is invalid, split it in two. First encode the first two
    661 	   (or possibly 1) instructions, and then the last, separately. Note that
    662 	   assign_pipes may have re-ordered the instrs (by inserting no-ops in
    663 	   lower slots) so we need to reset them. */
    664 
    665 	inst_buf_index = orig_index - 1;
    666 	inst_buf[0] = inst0;
    667 	inst_buf[1] = inst1;
    668 	inst_buf[2] = inst2;
    669 	if (assign_pipes() == 0) {
    670 		for (i = 0; i < inst_buf_index; i++) {
    671 			bits |= get_bundle_bit(inst_buf + i);
    672 #ifdef TILEGX_JIT_DEBUG
    673 			printf("|%04d", inst_buf[i].line);
    674 #endif
    675 		}
    676 
    677 #ifdef TILEGX_JIT_DEBUG
    678 		if (inst_buf_index == 3)
    679 			printf("|M1|:\t");
    680 		else
    681 			printf("|M1|:\t\t");
    682 		print_insn_tilegx(&bits);
    683 #endif
    684 
    685 		if ((orig_index - 1) == 2) {
    686 			inst_buf[0] = inst2;
    687 			inst_buf_index = 1;
    688 		} else if ((orig_index - 1) == 1) {
    689 			inst_buf[0] = inst1;
    690 			inst_buf_index = 1;
    691 		} else
    692 			SLJIT_UNREACHABLE();
    693 
    694 #ifdef TILEGX_JIT_DEBUG
    695 		return push_inst_nodebug(compiler, bits);
    696 #else
    697 		return push_inst(compiler, bits);
    698 #endif
    699 	} else {
    700 		/* We had 3 instrs of which the first 2 can't live in the same bundle.
    701 		   Split those two. Note that we don't try to then combine the second
    702 		   and third instr into a single bundle.  First instruction: */
    703 		inst_buf_index = 1;
    704 		inst_buf[0] = inst0;
    705 		inst_buf[1] = inst1;
    706 		inst_buf[2] = inst2;
    707 		if (assign_pipes() == 0) {
    708 			for (i = 0; i < inst_buf_index; i++) {
    709 				bits |= get_bundle_bit(inst_buf + i);
    710 #ifdef TILEGX_JIT_DEBUG
    711 				printf("|%04d", inst_buf[i].line);
    712 #endif
    713 			}
    714 
    715 #ifdef TILEGX_JIT_DEBUG
    716 			if (inst_buf_index == 3)
    717 				printf("|M2|:\t");
    718 			else
    719 				printf("|M2|:\t\t");
    720 			print_insn_tilegx(&bits);
    721 #endif
    722 
    723 			inst_buf[0] = inst1;
    724 			inst_buf[1] = inst2;
    725 			inst_buf_index = orig_index - 1;
    726 #ifdef TILEGX_JIT_DEBUG
    727 			return push_inst_nodebug(compiler, bits);
    728 #else
    729 			return push_inst(compiler, bits);
    730 #endif
    731 		} else
    732 			SLJIT_UNREACHABLE();
    733 	}
    734 
    735 	SLJIT_UNREACHABLE();
    736 }
    737 
    738 static sljit_s32 flush_buffer(struct sljit_compiler *compiler)
    739 {
    740 	while (inst_buf_index != 0) {
    741 		FAIL_IF(update_buffer(compiler));
    742 	}
    743 	return SLJIT_SUCCESS;
    744 }
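
         /* How these helpers interact (an illustrative sketch): the
            push_*_buffer() functions queue instructions until the buffer holds
            TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE of them; update_buffer() then
            emits one bundle, splitting the group whenever assign_pipes()
            reports a format or register conflict; flush_buffer() drains any
            remainder, e.g. before a jump. */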
    745 
    746 static sljit_s32 push_4_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int op0, int op1, int op2, int op3, int line)
    747 {
    748 	if (inst_buf_index == TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE)
    749 		FAIL_IF(update_buffer(compiler));
    750 
    751 	const struct tilegx_opcode* opcode = &tilegx_opcodes[opc];
    752 	inst_buf[inst_buf_index].opcode = opcode;
    753 	inst_buf[inst_buf_index].pipe = get_any_valid_pipe(opcode);
    754 	inst_buf[inst_buf_index].operand_value[0] = op0;
    755 	inst_buf[inst_buf_index].operand_value[1] = op1;
    756 	inst_buf[inst_buf_index].operand_value[2] = op2;
    757 	inst_buf[inst_buf_index].operand_value[3] = op3;
    758 	inst_buf[inst_buf_index].input_registers = 1L << op1;
    759 	inst_buf[inst_buf_index].output_registers = 1L << op0;
    760 	inst_buf[inst_buf_index].line = line;
    761 	inst_buf_index++;
    762 
    763 	return SLJIT_SUCCESS;
    764 }
    765 
    766 static sljit_s32 push_3_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int op0, int op1, int op2, int line)
    767 {
    768 	if (inst_buf_index == TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE)
    769 		FAIL_IF(update_buffer(compiler));
    770 
    771 	const struct tilegx_opcode* opcode = &tilegx_opcodes[opc];
    772 	inst_buf[inst_buf_index].opcode = opcode;
    773 	inst_buf[inst_buf_index].pipe = get_any_valid_pipe(opcode);
    774 	inst_buf[inst_buf_index].operand_value[0] = op0;
    775 	inst_buf[inst_buf_index].operand_value[1] = op1;
    776 	inst_buf[inst_buf_index].operand_value[2] = op2;
    777 	inst_buf[inst_buf_index].line = line;
    778 
    779 	switch (opc) {
    780 	case TILEGX_OPC_ST_ADD:
    781 		inst_buf[inst_buf_index].input_registers = (1L << op0) | (1L << op1);
    782 		inst_buf[inst_buf_index].output_registers = 1L << op0;
    783 		break;
    784 	case TILEGX_OPC_LD_ADD:
    785 		inst_buf[inst_buf_index].input_registers = 1L << op1;
    786 		inst_buf[inst_buf_index].output_registers = (1L << op0) | (1L << op1);
    787 		break;
    788 	case TILEGX_OPC_ADD:
    789 	case TILEGX_OPC_AND:
    790 	case TILEGX_OPC_SUB:
    791 	case TILEGX_OPC_MULX:
    792 	case TILEGX_OPC_OR:
    793 	case TILEGX_OPC_XOR:
    794 	case TILEGX_OPC_NOR:
    795 	case TILEGX_OPC_SHL:
    796 	case TILEGX_OPC_SHRU:
    797 	case TILEGX_OPC_SHRS:
    798 	case TILEGX_OPC_CMPLTU:
    799 	case TILEGX_OPC_CMPLTS:
    800 	case TILEGX_OPC_CMOVEQZ:
    801 	case TILEGX_OPC_CMOVNEZ:
    802 		inst_buf[inst_buf_index].input_registers = (1L << op1) | (1L << op2);
    803 		inst_buf[inst_buf_index].output_registers = 1L << op0;
    804 		break;
    805 	case TILEGX_OPC_ADDLI:
    806 	case TILEGX_OPC_XORI:
    807 	case TILEGX_OPC_ORI:
    808 	case TILEGX_OPC_SHLI:
    809 	case TILEGX_OPC_SHRUI:
    810 	case TILEGX_OPC_SHRSI:
    811 	case TILEGX_OPC_SHL16INSLI:
    812 	case TILEGX_OPC_CMPLTUI:
    813 	case TILEGX_OPC_CMPLTSI:
    814 		inst_buf[inst_buf_index].input_registers = 1L << op1;
    815 		inst_buf[inst_buf_index].output_registers = 1L << op0;
    816 		break;
    817 	default:
     818 		printf("unrecognized opc: %s\n", opcode->name);
    819 		SLJIT_UNREACHABLE();
    820 	}
    821 
    822 	inst_buf_index++;
    823 
    824 	return SLJIT_SUCCESS;
    825 }
    826 
    827 static sljit_s32 push_2_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int op0, int op1, int line)
    828 {
    829 	if (inst_buf_index == TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE)
    830 		FAIL_IF(update_buffer(compiler));
    831 
    832 	const struct tilegx_opcode* opcode = &tilegx_opcodes[opc];
    833 	inst_buf[inst_buf_index].opcode = opcode;
    834 	inst_buf[inst_buf_index].pipe = get_any_valid_pipe(opcode);
    835 	inst_buf[inst_buf_index].operand_value[0] = op0;
    836 	inst_buf[inst_buf_index].operand_value[1] = op1;
    837 	inst_buf[inst_buf_index].line = line;
    838 
    839 	switch (opc) {
    840 	case TILEGX_OPC_BEQZ:
    841 	case TILEGX_OPC_BNEZ:
    842 		inst_buf[inst_buf_index].input_registers = 1L << op0;
    843 		break;
    844 	case TILEGX_OPC_ST:
    845 	case TILEGX_OPC_ST1:
    846 	case TILEGX_OPC_ST2:
    847 	case TILEGX_OPC_ST4:
    848 		inst_buf[inst_buf_index].input_registers = (1L << op0) | (1L << op1);
    849 		inst_buf[inst_buf_index].output_registers = 0;
    850 		break;
    851 	case TILEGX_OPC_CLZ:
    852 	case TILEGX_OPC_LD:
    853 	case TILEGX_OPC_LD1U:
    854 	case TILEGX_OPC_LD1S:
    855 	case TILEGX_OPC_LD2U:
    856 	case TILEGX_OPC_LD2S:
    857 	case TILEGX_OPC_LD4U:
    858 	case TILEGX_OPC_LD4S:
    859 		inst_buf[inst_buf_index].input_registers = 1L << op1;
    860 		inst_buf[inst_buf_index].output_registers = 1L << op0;
    861 		break;
    862 	default:
     863 		printf("unrecognized opc: %s\n", opcode->name);
    864 		SLJIT_UNREACHABLE();
    865 	}
    866 
    867 	inst_buf_index++;
    868 
    869 	return SLJIT_SUCCESS;
    870 }
    871 
    872 static sljit_s32 push_0_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int line)
    873 {
    874 	if (inst_buf_index == TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE)
    875 		FAIL_IF(update_buffer(compiler));
    876 
    877 	const struct tilegx_opcode* opcode = &tilegx_opcodes[opc];
    878 	inst_buf[inst_buf_index].opcode = opcode;
    879 	inst_buf[inst_buf_index].pipe = get_any_valid_pipe(opcode);
    880 	inst_buf[inst_buf_index].input_registers = 0;
    881 	inst_buf[inst_buf_index].output_registers = 0;
    882 	inst_buf[inst_buf_index].line = line;
    883 	inst_buf_index++;
    884 
    885 	return SLJIT_SUCCESS;
    886 }
    887 
    888 static sljit_s32 push_jr_buffer(struct sljit_compiler *compiler, tilegx_mnemonic opc, int op0, int line)
    889 {
    890 	if (inst_buf_index == TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE)
    891 		FAIL_IF(update_buffer(compiler));
    892 
    893 	const struct tilegx_opcode* opcode = &tilegx_opcodes[opc];
    894 	inst_buf[inst_buf_index].opcode = opcode;
    895 	inst_buf[inst_buf_index].pipe = get_any_valid_pipe(opcode);
    896 	inst_buf[inst_buf_index].operand_value[0] = op0;
    897 	inst_buf[inst_buf_index].input_registers = 1L << op0;
    898 	inst_buf[inst_buf_index].output_registers = 0;
    899 	inst_buf[inst_buf_index].line = line;
    900 	inst_buf_index++;
    901 
    902 	return flush_buffer(compiler);
    903 }
    904 
    905 static SLJIT_INLINE sljit_ins * detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code)
    906 {
    907 	sljit_sw diff;
    908 	sljit_uw target_addr;
    909 	sljit_ins *inst;
    910 
    911 	if (jump->flags & SLJIT_REWRITABLE_JUMP)
    912 		return code_ptr;
    913 
    914 	if (jump->flags & JUMP_ADDR)
    915 		target_addr = jump->u.target;
    916 	else {
    917 		SLJIT_ASSERT(jump->flags & JUMP_LABEL);
    918 		target_addr = (sljit_uw)(code + jump->u.label->size);
    919 	}
    920 
    921 	inst = (sljit_ins *)jump->addr;
    922 	if (jump->flags & IS_COND)
    923 		inst--;
    924 
    925 	diff = ((sljit_sw) target_addr - (sljit_sw) inst) >> 3;
    926 	if (diff <= SIMM_17BIT_MAX && diff >= SIMM_17BIT_MIN) {
    927 		jump->flags |= PATCH_B;
    928 
    929 		if (!(jump->flags & IS_COND)) {
    930 			if (jump->flags & IS_JAL) {
    931 				jump->flags &= ~(PATCH_B);
    932 				jump->flags |= PATCH_J;
    933 				inst[0] = JAL_X1;
    934 
    935 #ifdef TILEGX_JIT_DEBUG
    936 				printf("[runtime relocate]%04d:\t", __LINE__);
    937 				print_insn_tilegx(inst);
    938 #endif
    939 			} else {
    940 				inst[0] = BEQZ_X1 | SRCA_X1(ZERO);
    941 
    942 #ifdef TILEGX_JIT_DEBUG
    943 				printf("[runtime relocate]%04d:\t", __LINE__);
    944 				print_insn_tilegx(inst);
    945 #endif
    946 			}
    947 
    948 			return inst;
    949 		}
    950 
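         		/* Invert the branch condition: XOR-ing with 0x7 << 55 toggles
         		   the BrType field bits that map each X1 branch to its
         		   complement (beqz <-> bnez, bgez <-> bltz, and so on). */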
    951 		inst[0] = inst[0] ^ (0x7L << 55);
    952 
    953 #ifdef TILEGX_JIT_DEBUG
    954 		printf("[runtime relocate]%04d:\t", __LINE__);
    955 		print_insn_tilegx(inst);
    956 #endif
    957 		jump->addr -= sizeof(sljit_ins);
    958 		return inst;
    959 	}
    960 
    961 	if (jump->flags & IS_COND) {
    962 		if ((target_addr & ~0x3FFFFFFFL) == ((jump->addr + sizeof(sljit_ins)) & ~0x3FFFFFFFL)) {
    963 			jump->flags |= PATCH_J;
    964 			inst[0] = (inst[0] & ~(BOFF_X1(-1))) | BOFF_X1(2);
    965 			inst[1] = J_X1;
    966 			return inst + 1;
    967 		}
    968 
    969 		return code_ptr;
    970 	}
    971 
    972 	if ((target_addr & ~0x3FFFFFFFL) == ((jump->addr + sizeof(sljit_ins)) & ~0x3FFFFFFFL)) {
    973 		jump->flags |= PATCH_J;
    974 
    975 		if (jump->flags & IS_JAL) {
    976 			inst[0] = JAL_X1;
    977 
    978 #ifdef TILEGX_JIT_DEBUG
    979 			printf("[runtime relocate]%04d:\t", __LINE__);
    980 			print_insn_tilegx(inst);
    981 #endif
    982 
    983 		} else {
    984 			inst[0] = J_X1;
    985 
    986 #ifdef TILEGX_JIT_DEBUG
    987 			printf("[runtime relocate]%04d:\t", __LINE__);
    988 			print_insn_tilegx(inst);
    989 #endif
    990 		}
    991 
    992 		return inst;
    993 	}
    994 
    995 	return code_ptr;
    996 }
    997 
    998 SLJIT_API_FUNC_ATTRIBUTE void * sljit_generate_code(struct sljit_compiler *compiler)
    999 {
   1000 	struct sljit_memory_fragment *buf;
   1001 	sljit_ins *code;
   1002 	sljit_ins *code_ptr;
   1003 	sljit_ins *buf_ptr;
   1004 	sljit_ins *buf_end;
   1005 	sljit_uw word_count;
   1006 	sljit_uw addr;
   1007 
   1008 	struct sljit_label *label;
   1009 	struct sljit_jump *jump;
   1010 	struct sljit_const *const_;
   1011 
   1012 	CHECK_ERROR_PTR();
   1013 	CHECK_PTR(check_sljit_generate_code(compiler));
   1014 	reverse_buf(compiler);
   1015 
   1016 	code = (sljit_ins *)SLJIT_MALLOC_EXEC(compiler->size * sizeof(sljit_ins));
   1017 	PTR_FAIL_WITH_EXEC_IF(code);
   1018 	buf = compiler->buf;
   1019 
   1020 	code_ptr = code;
   1021 	word_count = 0;
   1022 	label = compiler->labels;
   1023 	jump = compiler->jumps;
   1024 	const_ = compiler->consts;
   1025 	do {
   1026 		buf_ptr = (sljit_ins *)buf->memory;
   1027 		buf_end = buf_ptr + (buf->used_size >> 3);
   1028 		do {
   1029 			*code_ptr = *buf_ptr++;
   1030 			SLJIT_ASSERT(!label || label->size >= word_count);
   1031 			SLJIT_ASSERT(!jump || jump->addr >= word_count);
   1032 			SLJIT_ASSERT(!const_ || const_->addr >= word_count);
   1033 			/* These structures are ordered by their address. */
   1034 			if (label && label->size == word_count) {
   1035 				/* Just recording the address. */
   1036 				label->addr = (sljit_uw) code_ptr;
   1037 				label->size = code_ptr - code;
   1038 				label = label->next;
   1039 			}
   1040 
   1041 			if (jump && jump->addr == word_count) {
   1042 				if (jump->flags & IS_JAL)
   1043 					jump->addr = (sljit_uw)(code_ptr - 4);
   1044 				else
   1045 					jump->addr = (sljit_uw)(code_ptr - 3);
   1046 
   1047 				code_ptr = detect_jump_type(jump, code_ptr, code);
   1048 				jump = jump->next;
   1049 			}
   1050 
   1051 			if (const_ && const_->addr == word_count) {
   1052 				/* Just recording the address. */
   1053 				const_->addr = (sljit_uw) code_ptr;
   1054 				const_ = const_->next;
   1055 			}
   1056 
   1057 			code_ptr++;
   1058 			word_count++;
   1059 		} while (buf_ptr < buf_end);
   1060 
   1061 		buf = buf->next;
   1062 	} while (buf);
   1063 
   1064 	if (label && label->size == word_count) {
   1065 		label->addr = (sljit_uw) code_ptr;
   1066 		label->size = code_ptr - code;
   1067 		label = label->next;
   1068 	}
   1069 
   1070 	SLJIT_ASSERT(!label);
   1071 	SLJIT_ASSERT(!jump);
   1072 	SLJIT_ASSERT(!const_);
   1073 	SLJIT_ASSERT(code_ptr - code <= (sljit_sw)compiler->size);
   1074 
   1075 	jump = compiler->jumps;
   1076 	while (jump) {
   1077 		do {
   1078 			addr = (jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target;
   1079 			buf_ptr = (sljit_ins *)jump->addr;
   1080 
   1081 			if (jump->flags & PATCH_B) {
   1082 				addr = (sljit_sw)(addr - (jump->addr)) >> 3;
   1083 				SLJIT_ASSERT((sljit_sw) addr <= SIMM_17BIT_MAX && (sljit_sw) addr >= SIMM_17BIT_MIN);
   1084 				buf_ptr[0] = (buf_ptr[0] & ~(BOFF_X1(-1))) | BOFF_X1(addr);
   1085 
   1086 #ifdef TILEGX_JIT_DEBUG
   1087 				printf("[runtime relocate]%04d:\t", __LINE__);
   1088 				print_insn_tilegx(buf_ptr);
   1089 #endif
   1090 				break;
   1091 			}
   1092 
   1093 			if (jump->flags & PATCH_J) {
   1094 				SLJIT_ASSERT((addr & ~0x3FFFFFFFL) == ((jump->addr + sizeof(sljit_ins)) & ~0x3FFFFFFFL));
   1095 				addr = (sljit_sw)(addr - (jump->addr)) >> 3;
   1096 				buf_ptr[0] = (buf_ptr[0] & ~(JOFF_X1(-1))) | JOFF_X1(addr);
   1097 
   1098 #ifdef TILEGX_JIT_DEBUG
   1099 				printf("[runtime relocate]%04d:\t", __LINE__);
   1100 				print_insn_tilegx(buf_ptr);
   1101 #endif
   1102 				break;
   1103 			}
   1104 
   1105 			SLJIT_ASSERT(!(jump->flags & IS_JAL));
   1106 
   1107 			/* Set the fields of immediate loads. */
   1108 			buf_ptr[0] = (buf_ptr[0] & ~(0xFFFFL << 43)) | (((addr >> 32) & 0xFFFFL) << 43);
   1109 			buf_ptr[1] = (buf_ptr[1] & ~(0xFFFFL << 43)) | (((addr >> 16) & 0xFFFFL) << 43);
   1110 			buf_ptr[2] = (buf_ptr[2] & ~(0xFFFFL << 43)) | ((addr & 0xFFFFL) << 43);
   1111 		} while (0);
   1112 
   1113 		jump = jump->next;
   1114 	}
   1115 
   1116 	compiler->error = SLJIT_ERR_COMPILED;
   1117 	compiler->executable_size = (code_ptr - code) * sizeof(sljit_ins);
   1118 	SLJIT_CACHE_FLUSH(code, code_ptr);
   1119 	return code;
   1120 }
   1121 
   1122 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_ar, sljit_sw imm)
   1123 {
   1124 
   1125 	if (imm <= SIMM_16BIT_MAX && imm >= SIMM_16BIT_MIN)
   1126 		return ADDLI(dst_ar, ZERO, imm);
   1127 
   1128 	if (imm <= SIMM_32BIT_MAX && imm >= SIMM_32BIT_MIN) {
   1129 		FAIL_IF(ADDLI(dst_ar, ZERO, imm >> 16));
   1130 		return SHL16INSLI(dst_ar, dst_ar, imm);
   1131 	}
   1132 
   1133 	if (imm <= SIMM_48BIT_MAX && imm >= SIMM_48BIT_MIN) {
   1134 		FAIL_IF(ADDLI(dst_ar, ZERO, imm >> 32));
   1135 		FAIL_IF(SHL16INSLI(dst_ar, dst_ar, imm >> 16));
   1136 		return SHL16INSLI(dst_ar, dst_ar, imm);
   1137 	}
   1138 
   1139 	FAIL_IF(ADDLI(dst_ar, ZERO, imm >> 48));
   1140 	FAIL_IF(SHL16INSLI(dst_ar, dst_ar, imm >> 32));
   1141 	FAIL_IF(SHL16INSLI(dst_ar, dst_ar, imm >> 16));
   1142 	return SHL16INSLI(dst_ar, dst_ar, imm);
   1143 }
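
         /* A worked example (illustrative): imm = 0x123456789abcdef0 takes the
            4-instruction path above:
                addli      dst, zero, 0x1234    -> 0x0000000000001234
                shl16insli dst, dst,  0x5678    -> 0x0000000012345678
                shl16insli dst, dst,  0x9abc    -> 0x0000123456789abc
                shl16insli dst, dst,  0xdef0    -> 0x123456789abcdef0
            each shl16insli shifts the partial value left by 16 bits and inserts
            the next 16-bit chunk. */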
   1144 
   1145 static sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst_ar, sljit_sw imm, int flush)
   1146 {
   1147 	/* Should *not* be optimized as load_immediate, as pcre relocation
    1148 	   mechanism will match this fixed 3-instruction pattern. */
   1149 	if (flush) {
   1150 		FAIL_IF(ADDLI_SOLO(dst_ar, ZERO, imm >> 32));
   1151 		FAIL_IF(SHL16INSLI_SOLO(dst_ar, dst_ar, imm >> 16));
   1152 		return SHL16INSLI_SOLO(dst_ar, dst_ar, imm);
   1153 	}
   1154 
   1155 	FAIL_IF(ADDLI(dst_ar, ZERO, imm >> 32));
   1156 	FAIL_IF(SHL16INSLI(dst_ar, dst_ar, imm >> 16));
   1157 	return SHL16INSLI(dst_ar, dst_ar, imm);
   1158 }
   1159 
   1160 static sljit_s32 emit_const_64(struct sljit_compiler *compiler, sljit_s32 dst_ar, sljit_sw imm, int flush)
   1161 {
   1162 	/* Should *not* be optimized as load_immediate, as pcre relocation
   1163 	   mechanism will match this fixed 4-instruction pattern. */
   1164 	if (flush) {
   1165 		FAIL_IF(ADDLI_SOLO(reg_map[dst_ar], ZERO, imm >> 48));
   1166 		FAIL_IF(SHL16INSLI_SOLO(reg_map[dst_ar], reg_map[dst_ar], imm >> 32));
   1167 		FAIL_IF(SHL16INSLI_SOLO(reg_map[dst_ar], reg_map[dst_ar], imm >> 16));
   1168 		return SHL16INSLI_SOLO(reg_map[dst_ar], reg_map[dst_ar], imm);
   1169 	}
   1170 
   1171 	FAIL_IF(ADDLI(reg_map[dst_ar], ZERO, imm >> 48));
   1172 	FAIL_IF(SHL16INSLI(reg_map[dst_ar], reg_map[dst_ar], imm >> 32));
   1173 	FAIL_IF(SHL16INSLI(reg_map[dst_ar], reg_map[dst_ar], imm >> 16));
   1174 	return SHL16INSLI(reg_map[dst_ar], reg_map[dst_ar], imm);
   1175 }
   1176 
   1177 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
   1178 	sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
   1179 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
   1180 {
   1181 	sljit_ins base;
   1182 	sljit_s32 i, tmp;
   1183 
   1184 	CHECK_ERROR();
   1185 	CHECK(check_sljit_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
   1186 	set_emit_enter(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
   1187 
   1188 	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
   1189 	local_size = (local_size + 7) & ~7;
   1190 	compiler->local_size = local_size;
   1191 
   1192 	if (local_size <= SIMM_16BIT_MAX) {
   1193 		/* Frequent case. */
   1194 		FAIL_IF(ADDLI(SLJIT_LOCALS_REG_mapped, SLJIT_LOCALS_REG_mapped, -local_size));
   1195 		base = SLJIT_LOCALS_REG_mapped;
   1196 	} else {
   1197 		FAIL_IF(load_immediate(compiler, TMP_REG1_mapped, local_size));
   1198 		FAIL_IF(ADD(TMP_REG2_mapped, SLJIT_LOCALS_REG_mapped, ZERO));
   1199 		FAIL_IF(SUB(SLJIT_LOCALS_REG_mapped, SLJIT_LOCALS_REG_mapped, TMP_REG1_mapped));
   1200 		base = TMP_REG2_mapped;
   1201 		local_size = 0;
   1202 	}
   1203 
   1204 	/* Save the return address. */
   1205 	FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8));
   1206 	FAIL_IF(ST_ADD(ADDR_TMP_mapped, RA, -8));
   1207 
   1208 	/* Save the S registers. */
   1209 	tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
   1210 	for (i = SLJIT_S0; i >= tmp; i--) {
   1211 		FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8));
   1212 	}
   1213 
   1214 	/* Save the R registers that need to be reserved. */
   1215 	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
   1216 		FAIL_IF(ST_ADD(ADDR_TMP_mapped, reg_map[i], -8));
   1217 	}
   1218 
   1219 	/* Move the arguments to S registers. */
   1220 	for (i = 0; i < args; i++) {
   1221 		FAIL_IF(ADD(reg_map[SLJIT_S0 - i], i, ZERO));
   1222 	}
   1223 
   1224 	return SLJIT_SUCCESS;
   1225 }
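
         /* Resulting frame layout (illustrative sketch): the return address is
            stored at [base + local_size - 8], the saved S registers and the
            reserved scratch registers follow downward in 8-byte slots, and the
            caller-requested locals occupy the space below them. */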
   1226 
   1227 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
   1228 	sljit_s32 options, sljit_s32 args, sljit_s32 scratches, sljit_s32 saveds,
   1229 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
   1230 {
   1231 	CHECK_ERROR();
   1232 	CHECK(check_sljit_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size));
   1233 	set_set_context(compiler, options, args, scratches, saveds, fscratches, fsaveds, local_size);
   1234 
   1235 	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
   1236 	compiler->local_size = (local_size + 7) & ~7;
   1237 
   1238 	return SLJIT_SUCCESS;
   1239 }
   1240 
   1241 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
   1242 {
   1243 	sljit_s32 local_size;
   1244 	sljit_ins base;
   1245 	sljit_s32 i, tmp;
   1246 	sljit_s32 saveds;
   1247 
   1248 	CHECK_ERROR();
   1249 	CHECK(check_sljit_emit_return(compiler, op, src, srcw));
   1250 
   1251 	FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
   1252 
   1253 	local_size = compiler->local_size;
   1254 	if (local_size <= SIMM_16BIT_MAX)
   1255 		base = SLJIT_LOCALS_REG_mapped;
   1256 	else {
   1257 		FAIL_IF(load_immediate(compiler, TMP_REG1_mapped, local_size));
   1258 		FAIL_IF(ADD(TMP_REG1_mapped, SLJIT_LOCALS_REG_mapped, TMP_REG1_mapped));
   1259 		base = TMP_REG1_mapped;
   1260 		local_size = 0;
   1261 	}
   1262 
   1263 	/* Restore the return address. */
   1264 	FAIL_IF(ADDLI(ADDR_TMP_mapped, base, local_size - 8));
   1265 	FAIL_IF(LD_ADD(RA, ADDR_TMP_mapped, -8));
   1266 
   1267 	/* Restore the S registers. */
   1268 	saveds = compiler->saveds;
   1269 	tmp = saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - saveds) : SLJIT_FIRST_SAVED_REG;
   1270 	for (i = SLJIT_S0; i >= tmp; i--) {
   1271 		FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8));
   1272 	}
   1273 
   1274 	/* Restore the R registers that need to be reserved. */
   1275 	for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
   1276 		FAIL_IF(LD_ADD(reg_map[i], ADDR_TMP_mapped, -8));
   1277 	}
   1278 
   1279 	if (compiler->local_size <= SIMM_16BIT_MAX)
   1280 		FAIL_IF(ADDLI(SLJIT_LOCALS_REG_mapped, SLJIT_LOCALS_REG_mapped, compiler->local_size));
   1281 	else
   1282 		FAIL_IF(ADD(SLJIT_LOCALS_REG_mapped, TMP_REG1_mapped, ZERO));
   1283 
   1284 	return JR(RA);
   1285 }
   1286 
    1287 /* reg_ar is an absolute register! */
   1288 
    1289 /* Can perform a memory access using at most an ADDLI plus the load/store. */
   1290 static sljit_s32 getput_arg_fast(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw)
   1291 {
   1292 	SLJIT_ASSERT(arg & SLJIT_MEM);
   1293 
   1294 	if ((!(flags & WRITE_BACK) || !(arg & REG_MASK))
   1295 			&& !(arg & OFFS_REG_MASK) && argw <= SIMM_16BIT_MAX && argw >= SIMM_16BIT_MIN) {
    1296 		/* Works for both absolute and relative addresses. */
   1297 		if (SLJIT_UNLIKELY(flags & ARG_TEST))
   1298 			return 1;
   1299 
   1300 		FAIL_IF(ADDLI(ADDR_TMP_mapped, reg_map[arg & REG_MASK], argw));
   1301 
   1302 		if (flags & LOAD_DATA)
   1303 			FAIL_IF(PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, ADDR_TMP_mapped));
   1304 		else
   1305 			FAIL_IF(PB2(data_transfer_insts[flags & MEM_MASK], ADDR_TMP_mapped, reg_ar));
   1306 
   1307 		return -1;
   1308 	}
   1309 
   1310 	return 0;
   1311 }
   1312 
   1313 /* See getput_arg below.
   1314    Note: can_cache is called only for binary operators. Those
    1315    operators always use word arguments without write back. */
   1316 static sljit_s32 can_cache(sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
   1317 {
   1318 	SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));
   1319 
   1320 	/* Simple operation except for updates. */
   1321 	if (arg & OFFS_REG_MASK) {
   1322 		argw &= 0x3;
   1323 		next_argw &= 0x3;
   1324 		if (argw && argw == next_argw
   1325 				&& (arg == next_arg || (arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK)))
   1326 			return 1;
   1327 		return 0;
   1328 	}
   1329 
   1330 	if (arg == next_arg) {
   1331 		if (((next_argw - argw) <= SIMM_16BIT_MAX
   1332 				&& (next_argw - argw) >= SIMM_16BIT_MIN))
   1333 			return 1;
   1334 
   1335 		return 0;
   1336 	}
   1337 
   1338 	return 0;
   1339 }
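
         /* Illustrative example: for two accesses through SLJIT_MEM1(SLJIT_S0)
            at offsets 0x12340 and 0x12348, can_cache() returns 1 because the
            offsets differ by well under 16 bits; getput_arg() then keeps the
            first computed address in TMP_REG3 and reaches the second with a
            single ADDLI of the 8-byte delta. */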
   1340 
   1341 /* Emit the necessary instructions. See can_cache above. */
   1342 static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
   1343 {
   1344 	sljit_s32 tmp_ar, base;
   1345 
   1346 	SLJIT_ASSERT(arg & SLJIT_MEM);
   1347 	if (!(next_arg & SLJIT_MEM)) {
   1348 		next_arg = 0;
   1349 		next_argw = 0;
   1350 	}
   1351 
   1352 	if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA))
   1353 		tmp_ar = reg_ar;
   1354 	else
   1355 		tmp_ar = TMP_REG1_mapped;
   1356 
   1357 	base = arg & REG_MASK;
   1358 
   1359 	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
   1360 		argw &= 0x3;
   1361 
   1362 		if ((flags & WRITE_BACK) && reg_ar == reg_map[base]) {
   1363 			SLJIT_ASSERT(!(flags & LOAD_DATA) && reg_map[TMP_REG1] != reg_ar);
   1364 			FAIL_IF(ADD(TMP_REG1_mapped, reg_ar, ZERO));
   1365 			reg_ar = TMP_REG1_mapped;
   1366 		}
   1367 
   1368 		/* Using the cache. */
   1369 		if (argw == compiler->cache_argw) {
   1370 			if (!(flags & WRITE_BACK)) {
   1371 				if (arg == compiler->cache_arg) {
   1372 					if (flags & LOAD_DATA)
   1373 						return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, TMP_REG3_mapped);
   1374 					else
   1375 						return PB2(data_transfer_insts[flags & MEM_MASK], TMP_REG3_mapped, reg_ar);
   1376 				}
   1377 
   1378 				if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) {
   1379 					if (arg == next_arg && argw == (next_argw & 0x3)) {
   1380 						compiler->cache_arg = arg;
   1381 						compiler->cache_argw = argw;
   1382 						FAIL_IF(ADD(TMP_REG3_mapped, reg_map[base], TMP_REG3_mapped));
   1383 						if (flags & LOAD_DATA)
   1384 							return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, TMP_REG3_mapped);
   1385 						else
   1386 							return PB2(data_transfer_insts[flags & MEM_MASK], TMP_REG3_mapped, reg_ar);
   1387 					}
   1388 
   1389 					FAIL_IF(ADD(tmp_ar, reg_map[base], TMP_REG3_mapped));
   1390 					if (flags & LOAD_DATA)
   1391 						return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, tmp_ar);
   1392 					else
   1393 						return PB2(data_transfer_insts[flags & MEM_MASK], tmp_ar, reg_ar);
   1394 				}
   1395 			} else {
   1396 				if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) {
   1397 					FAIL_IF(ADD(reg_map[base], reg_map[base], TMP_REG3_mapped));
   1398 					if (flags & LOAD_DATA)
   1399 						return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, reg_map[base]);
   1400 					else
   1401 						return PB2(data_transfer_insts[flags & MEM_MASK], reg_map[base], reg_ar);
   1402 				}
   1403 			}
   1404 		}
   1405 
   1406 		if (SLJIT_UNLIKELY(argw)) {
   1407 			compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
   1408 			compiler->cache_argw = argw;
   1409 			FAIL_IF(SHLI(TMP_REG3_mapped, reg_map[OFFS_REG(arg)], argw));
   1410 		}
   1411 
   1412 		if (!(flags & WRITE_BACK)) {
   1413 			if (arg == next_arg && argw == (next_argw & 0x3)) {
   1414 				compiler->cache_arg = arg;
   1415 				compiler->cache_argw = argw;
   1416 				FAIL_IF(ADD(TMP_REG3_mapped, reg_map[base], reg_map[!argw ? OFFS_REG(arg) : TMP_REG3]));
   1417 				tmp_ar = TMP_REG3_mapped;
   1418 			} else
   1419 				FAIL_IF(ADD(tmp_ar, reg_map[base], reg_map[!argw ? OFFS_REG(arg) : TMP_REG3]));
   1420 
   1421 			if (flags & LOAD_DATA)
   1422 				return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, tmp_ar);
   1423 			else
   1424 				return PB2(data_transfer_insts[flags & MEM_MASK], tmp_ar, reg_ar);
   1425 		}
   1426 
   1427 		FAIL_IF(ADD(reg_map[base], reg_map[base], reg_map[!argw ? OFFS_REG(arg) : TMP_REG3]));
   1428 
   1429 		if (flags & LOAD_DATA)
   1430 			return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, reg_map[base]);
   1431 		else
   1432 			return PB2(data_transfer_insts[flags & MEM_MASK], reg_map[base], reg_ar);
   1433 	}
   1434 
	if (SLJIT_UNLIKELY(flags & WRITE_BACK) && base) {
		/* Update only applies if a base register exists. */
		if (reg_ar == reg_map[base]) {
			SLJIT_ASSERT(!(flags & LOAD_DATA) && TMP_REG1_mapped != reg_ar);
			if (argw <= SIMM_16BIT_MAX && argw >= SIMM_16BIT_MIN) {
				FAIL_IF(ADDLI(ADDR_TMP_mapped, reg_map[base], argw));
				if (flags & LOAD_DATA)
					FAIL_IF(PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, ADDR_TMP_mapped));
				else
					FAIL_IF(PB2(data_transfer_insts[flags & MEM_MASK], ADDR_TMP_mapped, reg_ar));

				if (argw)
					return ADDLI(reg_map[base], reg_map[base], argw);

				return SLJIT_SUCCESS;
			}

			FAIL_IF(ADD(TMP_REG1_mapped, reg_ar, ZERO));
			reg_ar = TMP_REG1_mapped;
		}

		if (argw <= SIMM_16BIT_MAX && argw >= SIMM_16BIT_MIN) {
			if (argw)
				FAIL_IF(ADDLI(reg_map[base], reg_map[base], argw));
		} else {
			if (compiler->cache_arg == SLJIT_MEM
					&& argw - compiler->cache_argw <= SIMM_16BIT_MAX
					&& argw - compiler->cache_argw >= SIMM_16BIT_MIN) {
				if (argw != compiler->cache_argw) {
					/* The delta is an immediate, so use ADDLI (the
					   register form ADD cannot encode it), matching the
					   identical cache-update pattern later in this
					   function. */
					FAIL_IF(ADDLI(TMP_REG3_mapped, TMP_REG3_mapped, argw - compiler->cache_argw));
					compiler->cache_argw = argw;
				}

				FAIL_IF(ADD(reg_map[base], reg_map[base], TMP_REG3_mapped));
			} else {
				compiler->cache_arg = SLJIT_MEM;
				compiler->cache_argw = argw;
				FAIL_IF(load_immediate(compiler, TMP_REG3_mapped, argw));
				FAIL_IF(ADD(reg_map[base], reg_map[base], TMP_REG3_mapped));
			}
		}

		if (flags & LOAD_DATA)
			return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, reg_map[base]);
		else
			return PB2(data_transfer_insts[flags & MEM_MASK], reg_map[base], reg_ar);
	}

	if (compiler->cache_arg == arg
			&& argw - compiler->cache_argw <= SIMM_16BIT_MAX
			&& argw - compiler->cache_argw >= SIMM_16BIT_MIN) {
		if (argw != compiler->cache_argw) {
			FAIL_IF(ADDLI(TMP_REG3_mapped, TMP_REG3_mapped, argw - compiler->cache_argw));
			compiler->cache_argw = argw;
		}

		if (flags & LOAD_DATA)
			return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, TMP_REG3_mapped);
		else
			return PB2(data_transfer_insts[flags & MEM_MASK], TMP_REG3_mapped, reg_ar);
	}

	if (compiler->cache_arg == SLJIT_MEM
			&& argw - compiler->cache_argw <= SIMM_16BIT_MAX
			&& argw - compiler->cache_argw >= SIMM_16BIT_MIN) {
		if (argw != compiler->cache_argw)
			FAIL_IF(ADDLI(TMP_REG3_mapped, TMP_REG3_mapped, argw - compiler->cache_argw));
	} else {
		compiler->cache_arg = SLJIT_MEM;
		FAIL_IF(load_immediate(compiler, TMP_REG3_mapped, argw));
	}

	compiler->cache_argw = argw;

	if (!base) {
		if (flags & LOAD_DATA)
			return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, TMP_REG3_mapped);
		else
			return PB2(data_transfer_insts[flags & MEM_MASK], TMP_REG3_mapped, reg_ar);
	}

	if (arg == next_arg
			&& next_argw - argw <= SIMM_16BIT_MAX
			&& next_argw - argw >= SIMM_16BIT_MIN) {
		compiler->cache_arg = arg;
		FAIL_IF(ADD(TMP_REG3_mapped, TMP_REG3_mapped, reg_map[base]));
		if (flags & LOAD_DATA)
			return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, TMP_REG3_mapped);
		else
			return PB2(data_transfer_insts[flags & MEM_MASK], TMP_REG3_mapped, reg_ar);
	}

	FAIL_IF(ADD(tmp_ar, TMP_REG3_mapped, reg_map[base]));

	if (flags & LOAD_DATA)
		return PB2(data_transfer_insts[flags & MEM_MASK], reg_ar, tmp_ar);
	else
		return PB2(data_transfer_insts[flags & MEM_MASK], tmp_ar, reg_ar);
}

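/* Editorial note (a sketch, not part of the original code): getput_arg()
   keeps the most recently computed address in TMP_REG3 and records it in
   compiler->cache_arg/cache_argw, so a later access that differs only by a
   small delta costs one ADDLI instead of a full load_immediate().  For two
   loads from [base + 0x12345] and [base + 0x12349] (offsets too large for a
   16-bit immediate) the emitted sequence is roughly:

	load_immediate r_tmp3, 0x12345	;; cache_arg/cache_argw now primed
	add	r_tmp3, r_tmp3, r_base
	ld	r_dst1, r_tmp3
	addli	r_tmp3, r_tmp3, 4	;; only the delta for the second load
	ld	r_dst2, r_tmp3

   (r_tmp3/r_base/r_dst* are placeholder names.)  emit_op_mem() below takes
   the fast path when getput_arg_fast() can encode the access directly and
   resets the cache before falling back to getput_arg(). */
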
static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw)
{
	if (getput_arg_fast(compiler, flags, reg_ar, arg, argw))
		return compiler->error;

	compiler->cache_arg = 0;
	compiler->cache_argw = 0;
	return getput_arg(compiler, flags, reg_ar, arg, argw, 0, 0);
}

static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, sljit_sw arg2w)
{
	if (getput_arg_fast(compiler, flags, reg, arg1, arg1w))
		return compiler->error;
	return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
}

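/* Editorial note: emit_op_mem2() above forwards the *next* argument pair to
   getput_arg() so it can decide whether priming the TMP_REG3 cache is
   worthwhile.  A typical call (mirroring the one in sljit_emit_op_flags()
   below) is:

	FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, TMP_REG1_mapped,
		src, srcw, dst, dstw));
*/
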
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
{
	CHECK_ERROR();
	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	/* For UNUSED dst. Uncommon, but possible. */
	if (dst == SLJIT_UNUSED)
		return SLJIT_SUCCESS;

	if (FAST_IS_REG(dst))
		return ADD(reg_map[dst], RA, ZERO);

	/* Memory. */
	return emit_op_mem(compiler, WORD_DATA, RA, dst, dstw);
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_return(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
{
	CHECK_ERROR();
	CHECK(check_sljit_emit_fast_return(compiler, src, srcw));
	ADJUST_LOCAL_OFFSET(src, srcw);

	if (FAST_IS_REG(src))
		FAIL_IF(ADD(RA, reg_map[src], ZERO));

	else if (src & SLJIT_MEM)
		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, RA, src, srcw));

	else if (src & SLJIT_IMM)
		FAIL_IF(load_immediate(compiler, RA, srcw));

	return JR(RA);
}

static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags, sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
{
	sljit_s32 overflow_ra = 0;

	switch (GET_OPCODE(op)) {
	case SLJIT_MOV:
	case SLJIT_MOV_P:
		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
		if (dst != src2)
			return ADD(reg_map[dst], reg_map[src2], ZERO);
		return SLJIT_SUCCESS;

	case SLJIT_MOV_U32:
	case SLJIT_MOV_S32:
		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
			if (op == SLJIT_MOV_S32)
				return BFEXTS(reg_map[dst], reg_map[src2], 0, 31);

			return BFEXTU(reg_map[dst], reg_map[src2], 0, 31);
		} else if (dst != src2) {
			SLJIT_ASSERT(src2 == 0);
			return ADD(reg_map[dst], reg_map[src2], ZERO);
		}

		return SLJIT_SUCCESS;

	case SLJIT_MOV_U8:
	case SLJIT_MOV_S8:
		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
			if (op == SLJIT_MOV_S8)
				return BFEXTS(reg_map[dst], reg_map[src2], 0, 7);

			return BFEXTU(reg_map[dst], reg_map[src2], 0, 7);
		} else if (dst != src2) {
			SLJIT_ASSERT(src2 == 0);
			return ADD(reg_map[dst], reg_map[src2], ZERO);
		}

		return SLJIT_SUCCESS;

	case SLJIT_MOV_U16:
	case SLJIT_MOV_S16:
		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
			if (op == SLJIT_MOV_S16)
				return BFEXTS(reg_map[dst], reg_map[src2], 0, 15);

			return BFEXTU(reg_map[dst], reg_map[src2], 0, 15);
		} else if (dst != src2) {
			SLJIT_ASSERT(src2 == 0);
			return ADD(reg_map[dst], reg_map[src2], ZERO);
		}

		return SLJIT_SUCCESS;

	case SLJIT_NOT:
		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
		if (op & SLJIT_SET_E)
			FAIL_IF(NOR(EQUAL_FLAG, reg_map[src2], reg_map[src2]));
		if (CHECK_FLAGS(SLJIT_SET_E))
			FAIL_IF(NOR(reg_map[dst], reg_map[src2], reg_map[src2]));

		return SLJIT_SUCCESS;

	case SLJIT_CLZ:
		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
		if (op & SLJIT_SET_E)
			FAIL_IF(CLZ(EQUAL_FLAG, reg_map[src2]));
		if (CHECK_FLAGS(SLJIT_SET_E))
			FAIL_IF(CLZ(reg_map[dst], reg_map[src2]));

		return SLJIT_SUCCESS;

	case SLJIT_ADD:
		if (flags & SRC2_IMM) {
			if (op & SLJIT_SET_O) {
				FAIL_IF(SHRUI(TMP_EREG1, reg_map[src1], 63));
				if (src2 < 0)
					FAIL_IF(XORI(TMP_EREG1, TMP_EREG1, 1));
			}

			if (op & SLJIT_SET_E)
				FAIL_IF(ADDLI(EQUAL_FLAG, reg_map[src1], src2));

			if (op & SLJIT_SET_C) {
				if (src2 >= 0)
					FAIL_IF(ORI(ULESS_FLAG, reg_map[src1], src2));
				else {
					FAIL_IF(ADDLI(ULESS_FLAG, ZERO, src2));
					FAIL_IF(OR(ULESS_FLAG, reg_map[src1], ULESS_FLAG));
				}
			}

			/* dst may be the same as src1 or src2. */
			if (CHECK_FLAGS(SLJIT_SET_E))
				FAIL_IF(ADDLI(reg_map[dst], reg_map[src1], src2));

			if (op & SLJIT_SET_O) {
				FAIL_IF(SHRUI(OVERFLOW_FLAG, reg_map[dst], 63));

				if (src2 < 0)
					FAIL_IF(XORI(OVERFLOW_FLAG, OVERFLOW_FLAG, 1));
			}
		} else {
			if (op & SLJIT_SET_O) {
				FAIL_IF(XOR(TMP_EREG1, reg_map[src1], reg_map[src2]));
				FAIL_IF(SHRUI(TMP_EREG1, TMP_EREG1, 63));

				if (src1 != dst)
					overflow_ra = reg_map[src1];
				else if (src2 != dst)
					overflow_ra = reg_map[src2];
				else {
					/* Rare occasion. */
					FAIL_IF(ADD(TMP_EREG2, reg_map[src1], ZERO));
					overflow_ra = TMP_EREG2;
				}
			}

			if (op & SLJIT_SET_E)
				FAIL_IF(ADD(EQUAL_FLAG, reg_map[src1], reg_map[src2]));

			if (op & SLJIT_SET_C)
				FAIL_IF(OR(ULESS_FLAG, reg_map[src1], reg_map[src2]));

			/* dst may be the same as src1 or src2. */
			if (CHECK_FLAGS(SLJIT_SET_E))
				FAIL_IF(ADD(reg_map[dst], reg_map[src1], reg_map[src2]));

			if (op & SLJIT_SET_O) {
				FAIL_IF(XOR(OVERFLOW_FLAG, reg_map[dst], overflow_ra));
				FAIL_IF(SHRUI(OVERFLOW_FLAG, OVERFLOW_FLAG, 63));
			}
		}

		/* a + b >= a | b (otherwise, the carry should be set to 1). */
		if (op & SLJIT_SET_C)
			FAIL_IF(CMPLTU(ULESS_FLAG, reg_map[dst], ULESS_FLAG));
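		/* Editorial worked example for the identity above (4-bit words):
		   a = 0b1100, b = 0b0110 gives a | b = 0b1110 = 14, while
		   a + b = 18 wraps to 0b0010 = 2; since 2 < 14, CMPLTU sets
		   ULESS_FLAG, which is exactly the carry out of the addition. */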

		if (op & SLJIT_SET_O)
			return CMOVNEZ(OVERFLOW_FLAG, TMP_EREG1, ZERO);

		return SLJIT_SUCCESS;

	case SLJIT_ADDC:
		if (flags & SRC2_IMM) {
			if (op & SLJIT_SET_C) {
				if (src2 >= 0)
					FAIL_IF(ORI(TMP_EREG1, reg_map[src1], src2));
				else {
					FAIL_IF(ADDLI(TMP_EREG1, ZERO, src2));
					FAIL_IF(OR(TMP_EREG1, reg_map[src1], TMP_EREG1));
				}
			}

			FAIL_IF(ADDLI(reg_map[dst], reg_map[src1], src2));

		} else {
			if (op & SLJIT_SET_C)
				FAIL_IF(OR(TMP_EREG1, reg_map[src1], reg_map[src2]));

			/* dst may be the same as src1 or src2. */
			FAIL_IF(ADD(reg_map[dst], reg_map[src1], reg_map[src2]));
		}

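		/* Editorial note: an add-with-carry can produce a carry twice,
		   once from src1 + src2 (detected below with the same
		   a + b >= a | b trick as in SLJIT_ADD, result kept in TMP_EREG1)
		   and once more when the incoming ULESS_FLAG is added to dst.  The
		   second case only happens when that final ADD wraps to zero,
		   which is what the (dst == 0) && (ULESS_FLAG == 1) test checks. */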
		if (op & SLJIT_SET_C)
			FAIL_IF(CMPLTU(TMP_EREG1, reg_map[dst], TMP_EREG1));

		FAIL_IF(ADD(reg_map[dst], reg_map[dst], ULESS_FLAG));

		if (!(op & SLJIT_SET_C))
			return SLJIT_SUCCESS;

		/* Set TMP_EREG2 to (dst == 0) && (ULESS_FLAG == 1). */
		FAIL_IF(CMPLTUI(TMP_EREG2, reg_map[dst], 1));
		FAIL_IF(AND(TMP_EREG2, TMP_EREG2, ULESS_FLAG));
		/* Set carry flag. */
		return OR(ULESS_FLAG, TMP_EREG2, TMP_EREG1);

	case SLJIT_SUB:
		if ((flags & SRC2_IMM) && ((op & (SLJIT_SET_U | SLJIT_SET_S)) || src2 == SIMM_16BIT_MIN)) {
			FAIL_IF(ADDLI(TMP_REG2_mapped, ZERO, src2));
			src2 = TMP_REG2;
			flags &= ~SRC2_IMM;
		}

		if (flags & SRC2_IMM) {
			if (op & SLJIT_SET_O) {
				FAIL_IF(SHRUI(TMP_EREG1, reg_map[src1], 63));

				if (src2 < 0)
					FAIL_IF(XORI(TMP_EREG1, TMP_EREG1, 1));

				if (src1 != dst)
					overflow_ra = reg_map[src1];
				else {
					/* Rare occasion. */
					FAIL_IF(ADD(TMP_EREG2, reg_map[src1], ZERO));
					overflow_ra = TMP_EREG2;
				}
			}

			if (op & SLJIT_SET_E)
				FAIL_IF(ADDLI(EQUAL_FLAG, reg_map[src1], -src2));

			if (op & SLJIT_SET_C) {
				FAIL_IF(load_immediate(compiler, ADDR_TMP_mapped, src2));
				FAIL_IF(CMPLTU(ULESS_FLAG, reg_map[src1], ADDR_TMP_mapped));
			}

			/* dst may be the same as src1 or src2. */
			if (CHECK_FLAGS(SLJIT_SET_E))
				FAIL_IF(ADDLI(reg_map[dst], reg_map[src1], -src2));

		} else {
			if (op & SLJIT_SET_O) {
				FAIL_IF(XOR(TMP_EREG1, reg_map[src1], reg_map[src2]));
				FAIL_IF(SHRUI(TMP_EREG1, TMP_EREG1, 63));

				if (src1 != dst)
					overflow_ra = reg_map[src1];
				else {
					/* Rare occasion. */
					FAIL_IF(ADD(TMP_EREG2, reg_map[src1], ZERO));
					overflow_ra = TMP_EREG2;
				}
			}

			if (op & SLJIT_SET_E)
				FAIL_IF(SUB(EQUAL_FLAG, reg_map[src1], reg_map[src2]));

			if (op & (SLJIT_SET_U | SLJIT_SET_C))
				FAIL_IF(CMPLTU(ULESS_FLAG, reg_map[src1], reg_map[src2]));

			if (op & SLJIT_SET_U)
				FAIL_IF(CMPLTU(UGREATER_FLAG, reg_map[src2], reg_map[src1]));

			if (op & SLJIT_SET_S) {
				FAIL_IF(CMPLTS(LESS_FLAG, reg_map[src1], reg_map[src2]));
				FAIL_IF(CMPLTS(GREATER_FLAG, reg_map[src2], reg_map[src1]));
			}

			/* dst may be the same as src1 or src2. */
			if (CHECK_FLAGS(SLJIT_SET_E | SLJIT_SET_U | SLJIT_SET_S | SLJIT_SET_C))
				FAIL_IF(SUB(reg_map[dst], reg_map[src1], reg_map[src2]));
		}

		if (op & SLJIT_SET_O) {
			FAIL_IF(XOR(OVERFLOW_FLAG, reg_map[dst], overflow_ra));
			FAIL_IF(SHRUI(OVERFLOW_FLAG, OVERFLOW_FLAG, 63));
			return CMOVEQZ(OVERFLOW_FLAG, TMP_EREG1, ZERO);
		}

		return SLJIT_SUCCESS;

	case SLJIT_SUBC:
		if ((flags & SRC2_IMM) && src2 == SIMM_16BIT_MIN) {
			FAIL_IF(ADDLI(TMP_REG2_mapped, ZERO, src2));
			src2 = TMP_REG2;
			flags &= ~SRC2_IMM;
		}

		if (flags & SRC2_IMM) {
			if (op & SLJIT_SET_C) {
				FAIL_IF(load_immediate(compiler, ADDR_TMP_mapped, -src2));
				FAIL_IF(CMPLTU(TMP_EREG1, reg_map[src1], ADDR_TMP_mapped));
			}

			/* dst may be the same as src1 or src2. */
			FAIL_IF(ADDLI(reg_map[dst], reg_map[src1], -src2));

		} else {
			if (op & SLJIT_SET_C)
				FAIL_IF(CMPLTU(TMP_EREG1, reg_map[src1], reg_map[src2]));

			/* dst may be the same as src1 or src2. */
			FAIL_IF(SUB(reg_map[dst], reg_map[src1], reg_map[src2]));
		}

		if (op & SLJIT_SET_C)
			FAIL_IF(CMOVEQZ(TMP_EREG1, reg_map[dst], ULESS_FLAG));

		FAIL_IF(SUB(reg_map[dst], reg_map[dst], ULESS_FLAG));

		if (op & SLJIT_SET_C)
			FAIL_IF(ADD(ULESS_FLAG, TMP_EREG1, ZERO));

		return SLJIT_SUCCESS;

	case SLJIT_MUL:
		if (flags & SRC2_IMM) {
			FAIL_IF(load_immediate(compiler, TMP_REG2_mapped, src2));
			src2 = TMP_REG2;
			flags &= ~SRC2_IMM;
		}

		FAIL_IF(MUL(reg_map[dst], reg_map[src1], reg_map[src2]));

		return SLJIT_SUCCESS;

#define EMIT_LOGICAL(op_imm, op_norm) \
	if (flags & SRC2_IMM) { \
		FAIL_IF(load_immediate(compiler, ADDR_TMP_mapped, src2)); \
		if (op & SLJIT_SET_E) \
			FAIL_IF(push_3_buffer( \
				compiler, op_norm, EQUAL_FLAG, reg_map[src1], \
				ADDR_TMP_mapped, __LINE__)); \
		if (CHECK_FLAGS(SLJIT_SET_E)) \
			FAIL_IF(push_3_buffer( \
				compiler, op_norm, reg_map[dst], reg_map[src1], \
				ADDR_TMP_mapped, __LINE__)); \
	} else { \
		if (op & SLJIT_SET_E) \
			FAIL_IF(push_3_buffer( \
				compiler, op_norm, EQUAL_FLAG, reg_map[src1], \
				reg_map[src2], __LINE__)); \
		if (CHECK_FLAGS(SLJIT_SET_E)) \
			FAIL_IF(push_3_buffer( \
				compiler, op_norm, reg_map[dst], reg_map[src1], \
				reg_map[src2], __LINE__)); \
	}

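/* Editorial sketch of what EMIT_LOGICAL above expands to for SLJIT_AND with
   two register operands and SLJIT_SET_E requested:

	and	EQUAL_FLAG, r_src1, r_src2	;; flag register gets the result
	and	r_dst, r_src1, r_src2		;; skipped for flag-only variants

   Note that op_imm is currently unused: immediate operands are first
   materialized into ADDR_TMP via load_immediate() and the op_norm form is
   used in both branches. */
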
	case SLJIT_AND:
		EMIT_LOGICAL(TILEGX_OPC_ANDI, TILEGX_OPC_AND);
		return SLJIT_SUCCESS;

	case SLJIT_OR:
		EMIT_LOGICAL(TILEGX_OPC_ORI, TILEGX_OPC_OR);
		return SLJIT_SUCCESS;

	case SLJIT_XOR:
		EMIT_LOGICAL(TILEGX_OPC_XORI, TILEGX_OPC_XOR);
		return SLJIT_SUCCESS;

#define EMIT_SHIFT(op_imm, op_norm) \
	if (flags & SRC2_IMM) { \
		if (op & SLJIT_SET_E) \
			FAIL_IF(push_3_buffer( \
				compiler, op_imm, EQUAL_FLAG, reg_map[src1], \
				src2 & 0x3F, __LINE__)); \
		if (CHECK_FLAGS(SLJIT_SET_E)) \
			FAIL_IF(push_3_buffer( \
				compiler, op_imm, reg_map[dst], reg_map[src1], \
				src2 & 0x3F, __LINE__)); \
	} else { \
		if (op & SLJIT_SET_E) \
			FAIL_IF(push_3_buffer( \
				compiler, op_norm, EQUAL_FLAG, reg_map[src1], \
				reg_map[src2], __LINE__)); \
		if (CHECK_FLAGS(SLJIT_SET_E)) \
			FAIL_IF(push_3_buffer( \
				compiler, op_norm, reg_map[dst], reg_map[src1], \
				reg_map[src2], __LINE__)); \
	}

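/* Editorial note: unlike EMIT_LOGICAL, EMIT_SHIFT above does use its
   immediate form, masking the shift count with 0x3F (the valid range for
   64-bit shifts); sljit_emit_op2() below applies the same masking before
   emit_op() is reached. */
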
	case SLJIT_SHL:
		EMIT_SHIFT(TILEGX_OPC_SHLI, TILEGX_OPC_SHL);
		return SLJIT_SUCCESS;

	case SLJIT_LSHR:
		EMIT_SHIFT(TILEGX_OPC_SHRUI, TILEGX_OPC_SHRU);
		return SLJIT_SUCCESS;

	case SLJIT_ASHR:
		EMIT_SHIFT(TILEGX_OPC_SHRSI, TILEGX_OPC_SHRS);
		return SLJIT_SUCCESS;
	}

	SLJIT_UNREACHABLE();
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags, sljit_s32 dst, sljit_sw dstw, sljit_s32 src1, sljit_sw src1w, sljit_s32 src2, sljit_sw src2w)
{
	/* arg1 goes to TMP_REG1 or src reg.
	   arg2 goes to TMP_REG2, imm or src reg.
	   TMP_REG3 can be used for caching.
	   result goes to TMP_REG2, so put result can use TMP_REG1 and TMP_REG3. */
	sljit_s32 dst_r = TMP_REG2;
	sljit_s32 src1_r;
	sljit_sw src2_r = 0;
	sljit_s32 sugg_src2_r = TMP_REG2;

	if (!(flags & ALT_KEEP_CACHE)) {
		compiler->cache_arg = 0;
		compiler->cache_argw = 0;
	}

	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
		if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32 && !(src2 & SLJIT_MEM))
			return SLJIT_SUCCESS;
		if (GET_FLAGS(op))
			flags |= UNUSED_DEST;
	} else if (FAST_IS_REG(dst)) {
		dst_r = dst;
		flags |= REG_DEST;
		if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
			sugg_src2_r = dst_r;
	} else if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, flags | ARG_TEST, TMP_REG1_mapped, dst, dstw))
		flags |= SLOW_DEST;

	if (flags & IMM_OP) {
		if ((src2 & SLJIT_IMM) && src2w) {
			if ((!(flags & LOGICAL_OP)
					&& (src2w <= SIMM_16BIT_MAX && src2w >= SIMM_16BIT_MIN))
					|| ((flags & LOGICAL_OP) && !(src2w & ~UIMM_16BIT_MAX))) {
				flags |= SRC2_IMM;
				src2_r = src2w;
			}
		}

		if (!(flags & SRC2_IMM) && (flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w) {
			if ((!(flags & LOGICAL_OP)
					&& (src1w <= SIMM_16BIT_MAX && src1w >= SIMM_16BIT_MIN))
					|| ((flags & LOGICAL_OP) && !(src1w & ~UIMM_16BIT_MAX))) {
				flags |= SRC2_IMM;
				src2_r = src1w;

				/* And swap arguments. */
				src1 = src2;
				src1w = src2w;
				src2 = SLJIT_IMM;
				/* src2w = src2_r unneeded. */
			}
		}
	}

	/* Source 1. */
	if (FAST_IS_REG(src1)) {
		src1_r = src1;
		flags |= REG1_SOURCE;
	} else if (src1 & SLJIT_IMM) {
		if (src1w) {
			FAIL_IF(load_immediate(compiler, TMP_REG1_mapped, src1w));
			src1_r = TMP_REG1;
		} else
			src1_r = 0;
	} else {
		if (getput_arg_fast(compiler, flags | LOAD_DATA, TMP_REG1_mapped, src1, src1w))
			FAIL_IF(compiler->error);
		else
			flags |= SLOW_SRC1;
		src1_r = TMP_REG1;
	}

	/* Source 2. */
	if (FAST_IS_REG(src2)) {
		src2_r = src2;
		flags |= REG2_SOURCE;
		if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
			dst_r = src2_r;
	} else if (src2 & SLJIT_IMM) {
		if (!(flags & SRC2_IMM)) {
			if (src2w) {
				FAIL_IF(load_immediate(compiler, reg_map[sugg_src2_r], src2w));
				src2_r = sugg_src2_r;
			} else {
				src2_r = 0;
				if ((op >= SLJIT_MOV && op <= SLJIT_MOVU_S32) && (dst & SLJIT_MEM))
					dst_r = 0;
			}
		}
	} else {
		if (getput_arg_fast(compiler, flags | LOAD_DATA, reg_map[sugg_src2_r], src2, src2w))
			FAIL_IF(compiler->error);
		else
			flags |= SLOW_SRC2;
		src2_r = sugg_src2_r;
	}

	if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
		SLJIT_ASSERT(src2_r == TMP_REG2);
		if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG2_mapped, src2, src2w, src1, src1w));
			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1_mapped, src1, src1w, dst, dstw));
		} else {
			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1_mapped, src1, src1w, src2, src2w));
			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG2_mapped, src2, src2w, dst, dstw));
		}
	} else if (flags & SLOW_SRC1)
		FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1_mapped, src1, src1w, dst, dstw));
	else if (flags & SLOW_SRC2)
		FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, reg_map[sugg_src2_r], src2, src2w, dst, dstw));

	FAIL_IF(emit_single_op(compiler, op, flags, dst_r, src1_r, src2_r));

	if (dst & SLJIT_MEM) {
		if (!(flags & SLOW_DEST)) {
			getput_arg_fast(compiler, flags, reg_map[dst_r], dst, dstw);
			return compiler->error;
		}

		return getput_arg(compiler, flags, reg_map[dst_r], dst, dstw, 0, 0);
	}

	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw dstw, sljit_s32 src, sljit_sw srcw, sljit_s32 type)
{
	sljit_s32 sugg_dst_ar, dst_ar;
	sljit_s32 flags = GET_ALL_FLAGS(op);
	sljit_s32 mem_type = (op & SLJIT_I32_OP) ? (INT_DATA | SIGNED_DATA) : WORD_DATA;

	CHECK_ERROR();
	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	if (dst == SLJIT_UNUSED)
		return SLJIT_SUCCESS;

	op = GET_OPCODE(op);
	if (op == SLJIT_MOV_S32 || op == SLJIT_MOV_U32)
		mem_type = INT_DATA | SIGNED_DATA;
	sugg_dst_ar = reg_map[(op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2];

	compiler->cache_arg = 0;
	compiler->cache_argw = 0;
	if (op >= SLJIT_ADD && (src & SLJIT_MEM)) {
		ADJUST_LOCAL_OFFSET(src, srcw);
		FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, TMP_REG1_mapped, src, srcw, dst, dstw));
		src = TMP_REG1;
		srcw = 0;
	}

	switch (type & 0xff) {
	case SLJIT_EQUAL:
	case SLJIT_NOT_EQUAL:
		FAIL_IF(CMPLTUI(sugg_dst_ar, EQUAL_FLAG, 1));
		dst_ar = sugg_dst_ar;
		break;
	case SLJIT_LESS:
	case SLJIT_GREATER_EQUAL:
		dst_ar = ULESS_FLAG;
		break;
	case SLJIT_GREATER:
	case SLJIT_LESS_EQUAL:
		dst_ar = UGREATER_FLAG;
		break;
	case SLJIT_SIG_LESS:
	case SLJIT_SIG_GREATER_EQUAL:
		dst_ar = LESS_FLAG;
		break;
	case SLJIT_SIG_GREATER:
	case SLJIT_SIG_LESS_EQUAL:
		dst_ar = GREATER_FLAG;
		break;
	case SLJIT_OVERFLOW:
	case SLJIT_NOT_OVERFLOW:
		dst_ar = OVERFLOW_FLAG;
		break;
	case SLJIT_MUL_OVERFLOW:
	case SLJIT_MUL_NOT_OVERFLOW:
		FAIL_IF(CMPLTUI(sugg_dst_ar, OVERFLOW_FLAG, 1));
		dst_ar = sugg_dst_ar;
		type ^= 0x1; /* Flip type bit for the XORI below. */
		break;
	default:
		SLJIT_UNREACHABLE();
		dst_ar = sugg_dst_ar;
		break;
	}

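	/* Editorial note: sljit condition codes come in pairs whose low bit
	   selects the negated form, so the single XORI below materializes the
	   inverse.  Sketch for SLJIT_NOT_EQUAL (low bit set):

		cmpltui	r_dst, EQUAL_FLAG, 1	;; r_dst = (EQUAL_FLAG == 0), i.e. "equal"
		xori	r_dst, r_dst, 1		;; invert -> "not equal" */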
	if (type & 0x1) {
		FAIL_IF(XORI(sugg_dst_ar, dst_ar, 1));
		dst_ar = sugg_dst_ar;
	}

	if (op >= SLJIT_ADD) {
		if (TMP_REG2_mapped != dst_ar)
			FAIL_IF(ADD(TMP_REG2_mapped, dst_ar, ZERO));
		return emit_op(compiler, op | flags, mem_type | CUMULATIVE_OP | LOGICAL_OP | IMM_OP | ALT_KEEP_CACHE, dst, dstw, src, srcw, TMP_REG2, 0);
	}

	if (dst & SLJIT_MEM)
		return emit_op_mem(compiler, mem_type, dst_ar, dst, dstw);

	if (sugg_dst_ar != dst_ar)
		return ADD(sugg_dst_ar, dst_ar, ZERO);

	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
{
	CHECK_ERROR();
	CHECK(check_sljit_emit_op0(compiler, op));

	op = GET_OPCODE(op);
	switch (op) {
	case SLJIT_NOP:
		return push_0_buffer(compiler, TILEGX_OPC_FNOP, __LINE__);

	case SLJIT_BREAKPOINT:
		return PI(BPT);

	case SLJIT_LMUL_UW:
	case SLJIT_LMUL_SW:
	case SLJIT_DIVMOD_UW:
	case SLJIT_DIVMOD_SW:
	case SLJIT_DIV_UW:
	case SLJIT_DIV_SW:
		SLJIT_UNREACHABLE();
	}

	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw dstw, sljit_s32 src, sljit_sw srcw)
{
	CHECK_ERROR();
	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src, srcw);

	switch (GET_OPCODE(op)) {
	case SLJIT_MOV:
	case SLJIT_MOV_P:
		return emit_op(compiler, SLJIT_MOV, WORD_DATA, dst, dstw, TMP_REG1, 0, src, srcw);

	case SLJIT_MOV_U32:
		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA, dst, dstw, TMP_REG1, 0, src, srcw);

	case SLJIT_MOV_S32:
		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA, dst, dstw, TMP_REG1, 0, src, srcw);

	case SLJIT_MOV_U8:
		return emit_op(compiler, SLJIT_MOV_U8, BYTE_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8) srcw : srcw);

	case SLJIT_MOV_S8:
		return emit_op(compiler, SLJIT_MOV_S8, BYTE_DATA | SIGNED_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8) srcw : srcw);

	case SLJIT_MOV_U16:
		return emit_op(compiler, SLJIT_MOV_U16, HALF_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16) srcw : srcw);

	case SLJIT_MOV_S16:
		return emit_op(compiler, SLJIT_MOV_S16, HALF_DATA | SIGNED_DATA, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16) srcw : srcw);

	case SLJIT_MOVU:
	case SLJIT_MOVU_P:
		return emit_op(compiler, SLJIT_MOV, WORD_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);

	case SLJIT_MOVU_U32:
		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);

	case SLJIT_MOVU_S32:
		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, srcw);

	case SLJIT_MOVU_U8:
		return emit_op(compiler, SLJIT_MOV_U8, BYTE_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8) srcw : srcw);

	case SLJIT_MOVU_S8:
		return emit_op(compiler, SLJIT_MOV_S8, BYTE_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8) srcw : srcw);

	case SLJIT_MOVU_U16:
		return emit_op(compiler, SLJIT_MOV_U16, HALF_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16) srcw : srcw);

	case SLJIT_MOVU_S16:
		return emit_op(compiler, SLJIT_MOV_S16, HALF_DATA | SIGNED_DATA | WRITE_BACK, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16) srcw : srcw);

	case SLJIT_NOT:
		return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src, srcw);

	case SLJIT_NEG:
		return emit_op(compiler, SLJIT_SUB | GET_ALL_FLAGS(op), IMM_OP, dst, dstw, SLJIT_IMM, 0, src, srcw);

	case SLJIT_CLZ:
		return emit_op(compiler, op, (op & SLJIT_I32_OP) ? INT_DATA : WORD_DATA, dst, dstw, TMP_REG1, 0, src, srcw);
	}

	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw dstw, sljit_s32 src1, sljit_sw src1w, sljit_s32 src2, sljit_sw src2w)
{
	CHECK_ERROR();
	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src1, src1w);
	ADJUST_LOCAL_OFFSET(src2, src2w);

	switch (GET_OPCODE(op)) {
	case SLJIT_ADD:
	case SLJIT_ADDC:
		return emit_op(compiler, op, CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);

	case SLJIT_SUB:
	case SLJIT_SUBC:
		return emit_op(compiler, op, IMM_OP, dst, dstw, src1, src1w, src2, src2w);

	case SLJIT_MUL:
		return emit_op(compiler, op, CUMULATIVE_OP, dst, dstw, src1, src1w, src2, src2w);

	case SLJIT_AND:
	case SLJIT_OR:
	case SLJIT_XOR:
		return emit_op(compiler, op, CUMULATIVE_OP | LOGICAL_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);

	case SLJIT_SHL:
	case SLJIT_LSHR:
	case SLJIT_ASHR:
		if (src2 & SLJIT_IMM)
			src2w &= 0x3f;
		if (op & SLJIT_I32_OP)
			src2w &= 0x1f;

		return emit_op(compiler, op, IMM_OP, dst, dstw, src1, src1w, src2, src2w);
	}

	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE struct sljit_label * sljit_emit_label(struct sljit_compiler *compiler)
{
	struct sljit_label *label;

	flush_buffer(compiler);

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_label(compiler));

	if (compiler->last_label && compiler->last_label->size == compiler->size)
		return compiler->last_label;

	label = (struct sljit_label *)ensure_abuf(compiler, sizeof(struct sljit_label));
	PTR_FAIL_IF(!label);
	set_label(label, compiler);
	return label;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
{
	sljit_s32 src_r = TMP_REG2;
	struct sljit_jump *jump = NULL;

	flush_buffer(compiler);

	CHECK_ERROR();
	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
	ADJUST_LOCAL_OFFSET(src, srcw);

	if (FAST_IS_REG(src)) {
		if (reg_map[src] != 0)
			src_r = src;
		else
			FAIL_IF(ADD_SOLO(TMP_REG2_mapped, reg_map[src], ZERO));
	}

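	/* Editorial note on the call sequences below: r54 is the TILE-Gx stack
	   pointer in this port, so the ADDI_SOLO(54, 54, -16)/+16 pair brackets
	   each JALR with a 16-byte stack adjustment, while
	   ADD_SOLO(0, reg_map[SLJIT_R0], ZERO) moves the first sljit argument
	   into r0, the first TILE-Gx argument register. */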
	if (type >= SLJIT_CALL0) {
		SLJIT_ASSERT(reg_map[PIC_ADDR_REG] == 16 && PIC_ADDR_REG == TMP_REG2);
		if (src & (SLJIT_IMM | SLJIT_MEM)) {
			if (src & SLJIT_IMM)
				FAIL_IF(emit_const(compiler, reg_map[PIC_ADDR_REG], srcw, 1));
			else {
				SLJIT_ASSERT(src_r == TMP_REG2 && (src & SLJIT_MEM));
				FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
			}

			FAIL_IF(ADD_SOLO(0, reg_map[SLJIT_R0], ZERO));

			FAIL_IF(ADDI_SOLO(54, 54, -16));

			FAIL_IF(JALR_SOLO(reg_map[PIC_ADDR_REG]));

			return ADDI_SOLO(54, 54, 16);
		}

		/* Register input. */
		if (type >= SLJIT_CALL1)
			FAIL_IF(ADD_SOLO(0, reg_map[SLJIT_R0], ZERO));

		FAIL_IF(ADD_SOLO(reg_map[PIC_ADDR_REG], reg_map[src_r], ZERO));

		FAIL_IF(ADDI_SOLO(54, 54, -16));

		FAIL_IF(JALR_SOLO(reg_map[src_r]));

		return ADDI_SOLO(54, 54, 16);
	}

	if (src & SLJIT_IMM) {
		jump = (struct sljit_jump *)ensure_abuf(compiler, sizeof(struct sljit_jump));
		FAIL_IF(!jump);
		set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_JAL : 0));
		jump->u.target = srcw;
		FAIL_IF(emit_const(compiler, TMP_REG2_mapped, 0, 1));

		if (type >= SLJIT_FAST_CALL) {
			FAIL_IF(ADD_SOLO(ZERO, ZERO, ZERO));
			jump->addr = compiler->size;
			FAIL_IF(JR_SOLO(reg_map[src_r]));
		} else {
			jump->addr = compiler->size;
			FAIL_IF(JR_SOLO(reg_map[src_r]));
		}

		return SLJIT_SUCCESS;
	} else if (src & SLJIT_MEM) {
		FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
		flush_buffer(compiler);
	}

	FAIL_IF(JR_SOLO(reg_map[src_r]));

	if (jump)
		jump->addr = compiler->size;

	return SLJIT_SUCCESS;
}

#define BR_Z(src) \
	inst = BEQZ_X1 | SRCA_X1(src); \
	flags = IS_COND;

#define BR_NZ(src) \
	inst = BNEZ_X1 | SRCA_X1(src); \
	flags = IS_COND;

SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump * sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
{
	struct sljit_jump *jump;
	sljit_ins inst;
	sljit_s32 flags = 0;

	flush_buffer(compiler);

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_jump(compiler, type));

	jump = (struct sljit_jump *)ensure_abuf(compiler, sizeof(struct sljit_jump));
	PTR_FAIL_IF(!jump);
	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
	type &= 0xff;

	switch (type) {
	case SLJIT_EQUAL:
		BR_NZ(EQUAL_FLAG);
		break;
	case SLJIT_NOT_EQUAL:
		BR_Z(EQUAL_FLAG);
		break;
	case SLJIT_LESS:
		BR_Z(ULESS_FLAG);
		break;
	case SLJIT_GREATER_EQUAL:
		BR_NZ(ULESS_FLAG);
		break;
	case SLJIT_GREATER:
		BR_Z(UGREATER_FLAG);
		break;
	case SLJIT_LESS_EQUAL:
		BR_NZ(UGREATER_FLAG);
		break;
	case SLJIT_SIG_LESS:
		BR_Z(LESS_FLAG);
		break;
	case SLJIT_SIG_GREATER_EQUAL:
		BR_NZ(LESS_FLAG);
		break;
	case SLJIT_SIG_GREATER:
		BR_Z(GREATER_FLAG);
		break;
	case SLJIT_SIG_LESS_EQUAL:
		BR_NZ(GREATER_FLAG);
		break;
	case SLJIT_OVERFLOW:
	case SLJIT_MUL_OVERFLOW:
		BR_Z(OVERFLOW_FLAG);
		break;
	case SLJIT_NOT_OVERFLOW:
	case SLJIT_MUL_NOT_OVERFLOW:
		BR_NZ(OVERFLOW_FLAG);
		break;
	default:
		/* Not conditional branch. */
		inst = 0;
		break;
	}

	jump->flags |= flags;

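	/* Editorial note: BR_Z/BR_NZ above encode the *negated* condition, so
	   the branch emitted below skips, when taken, the constant-load plus
	   jump sequence that follows; that sequence appears to span 5 bundles
	   for a plain jump and 6 for a call, hence BOFF_X1(5) vs. BOFF_X1(6). */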
	if (inst) {
		inst = inst | ((type <= SLJIT_JUMP) ? BOFF_X1(5) : BOFF_X1(6));
		PTR_FAIL_IF(PI(inst));
	}

	PTR_FAIL_IF(emit_const(compiler, TMP_REG2_mapped, 0, 1));
	if (type <= SLJIT_JUMP) {
		jump->addr = compiler->size;
		PTR_FAIL_IF(JR_SOLO(TMP_REG2_mapped));
	} else {
		SLJIT_ASSERT(reg_map[PIC_ADDR_REG] == 16 && PIC_ADDR_REG == TMP_REG2);
		/* Cannot be optimized out if type is >= CALL0. */
		jump->flags |= IS_JAL | (type >= SLJIT_CALL0 ? SLJIT_REWRITABLE_JUMP : 0);
		PTR_FAIL_IF(ADD_SOLO(0, reg_map[SLJIT_R0], ZERO));
		jump->addr = compiler->size;
		PTR_FAIL_IF(JALR_SOLO(TMP_REG2_mapped));
	}

	return jump;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
{
	return 0;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw dstw, sljit_s32 src, sljit_sw srcw)
{
	/* No FPU on this target; these must never be reached.  Return a value
	   anyway so the function does not fall off its end in release builds. */
	SLJIT_UNREACHABLE();
	return SLJIT_ERR_UNSUPPORTED;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw dstw, sljit_s32 src1, sljit_sw src1w, sljit_s32 src2, sljit_sw src2w)
{
	SLJIT_UNREACHABLE();
	return SLJIT_ERR_UNSUPPORTED;
}

SLJIT_API_FUNC_ATTRIBUTE struct sljit_const * sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
{
	struct sljit_const *const_;
	sljit_s32 reg;

	flush_buffer(compiler);

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	const_ = (struct sljit_const *)ensure_abuf(compiler, sizeof(struct sljit_const));
	PTR_FAIL_IF(!const_);
	set_const(const_, compiler);

	reg = FAST_IS_REG(dst) ? dst : TMP_REG2;

	PTR_FAIL_IF(emit_const_64(compiler, reg, init_value, 1));

	if (dst & SLJIT_MEM)
		PTR_FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, dst, dstw, TMP_REG1, 0, TMP_REG2, 0));
	return const_;
}

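/* Editorial sketch (assuming emit_const() emits the usual TILE-Gx
   moveli/shl16insli sequence): jump targets are materialized by three
   bundles of the form

	moveli	    r_d, target[47:32]		;; each bundle carries 16 bits in
	shl16insli  r_d, r_d, target[31:16]	;; the X1 immediate field located
	shl16insli  r_d, r_d, target[15:0]	;; at bits 43..58 of the bundle

   so sljit_set_jump_addr() below only rewrites those three 16-bit fields in
   place; e.g. new_target 0x123456789abc patches 0x1234, 0x5678 and 0x9abc
   into inst[0..2] before the instruction cache is flushed. */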
SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target)
{
	sljit_ins *inst = (sljit_ins *)addr;

	inst[0] = (inst[0] & ~(0xFFFFL << 43)) | (((new_target >> 32) & 0xffff) << 43);
	inst[1] = (inst[1] & ~(0xFFFFL << 43)) | (((new_target >> 16) & 0xffff) << 43);
	inst[2] = (inst[2] & ~(0xFFFFL << 43)) | ((new_target & 0xffff) << 43);
	SLJIT_CACHE_FLUSH(inst, inst + 3);
}

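/* Editorial note: 64-bit constants use the four-bundle variant (see the
   emit_const_64() call in sljit_emit_const() above), so sljit_set_const()
   below rewrites all four 16-bit fields, from bits 63:48 down to 15:0. */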
SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
{
	sljit_ins *inst = (sljit_ins *)addr;

	inst[0] = (inst[0] & ~(0xFFFFL << 43)) | (((new_constant >> 48) & 0xFFFFL) << 43);
	inst[1] = (inst[1] & ~(0xFFFFL << 43)) | (((new_constant >> 32) & 0xFFFFL) << 43);
	inst[2] = (inst[2] & ~(0xFFFFL << 43)) | (((new_constant >> 16) & 0xFFFFL) << 43);
	inst[3] = (inst[3] & ~(0xFFFFL << 43)) | ((new_constant & 0xFFFFL) << 43);
	SLJIT_CACHE_FLUSH(inst, inst + 4);
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
{
	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
	return reg_map[reg];
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
	void *instruction, sljit_s32 size)
{
	CHECK_ERROR();
	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
	return SLJIT_ERR_UNSUPPORTED;
}
   2566