/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

var SQ_WAVE_STATUS_INST_ATC_SHIFT		= 23
var SQ_WAVE_STATUS_INST_ATC_MASK		= 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK		= 0x00000006
var SQ_WAVE_STATUS_HALT_MASK			= 0x2000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT		= 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE		= 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT		= 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE		= 4
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT	= 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE	= 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT		= 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE			= 1

var SQ_WAVE_TRAPSTS_SAVECTX_MASK		= 0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK			= 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT		= 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK		= 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT		= 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK		= 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT		= 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE		= 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK		= 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT		= 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE		= 21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK		= 0x800

var SQ_WAVE_IB_STS_RCNT_SHIFT			= 16
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT		= 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT		= 25
var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE		= 1
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK		= 0x02000000
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE		= 1
var SQ_WAVE_IB_STS_RCNT_SIZE			= 6
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK	= 0x003F8000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG	= 0x00007FFF

var SQ_BUF_RSRC_WORD1_ATC_SHIFT			= 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT		= 27

// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT		= 31
var TTMP11_SAVE_REPLAY_W64H_MASK		= 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT		= 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK		= 0x7F000000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE		= 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC			= 0x10807FAC

var S_SAVE_SPI_INIT_ATC_MASK			= 0x08000000
var S_SAVE_SPI_INIT_ATC_SHIFT			= 27
var S_SAVE_SPI_INIT_MTYPE_MASK			= 0x70000000
var S_SAVE_SPI_INIT_MTYPE_SHIFT			= 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT		= 26

var S_SAVE_PC_HI_RCNT_SHIFT			= 26
var S_SAVE_PC_HI_RCNT_MASK			= 0xFC000000
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT		= 25
var S_SAVE_PC_HI_FIRST_REPLAY_MASK		= 0x02000000
var S_SAVE_PC_HI_REPLAY_W64H_SHIFT		= 24
var S_SAVE_PC_HI_REPLAY_W64H_MASK		= 0x01000000

var s_sgpr_save_num				= 108

var s_save_spi_init_lo				= exec_lo
var s_save_spi_init_hi				= exec_hi
var s_save_pc_lo				= ttmp0
var s_save_pc_hi				= ttmp1
var s_save_exec_lo				= ttmp2
var s_save_exec_hi				= ttmp3
var s_save_status				= ttmp12
var s_save_trapsts				= ttmp5
var s_save_xnack_mask				= ttmp6
var s_wave_size					= ttmp7
var s_save_buf_rsrc0				= ttmp8
var s_save_buf_rsrc1				= ttmp9
var s_save_buf_rsrc2				= ttmp10
var s_save_buf_rsrc3				= ttmp11
var s_save_mem_offset				= ttmp14
var s_save_alloc_size				= s_save_trapsts
var s_save_tmp					= s_save_buf_rsrc2
var s_save_m0					= ttmp15

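// Save-area layout, derived from the offset computations in the save and
// restore paths below (all offsets in bytes from the SPI-provided base):
//   0                                                    : VGPRs
//   + size(VGPR)                                         : shared VGPRs (wave64)
//   + size(VGPR) + size(SVGPR)                           : SGPRs (512-byte region)
//   + size(VGPR) + size(SVGPR) + size(SGPR)              : HW registers (128-byte region)
//   + size(VGPR) + size(SVGPR) + size(SGPR) + size(HWREG): LDS
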
var S_RESTORE_BUF_RSRC_WORD1_STRIDE		= S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC		= S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_ATC_MASK			= 0x08000000
var S_RESTORE_SPI_INIT_ATC_SHIFT		= 27
var S_RESTORE_SPI_INIT_MTYPE_MASK		= 0x70000000
var S_RESTORE_SPI_INIT_MTYPE_SHIFT		= 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
var S_WAVE_SIZE					= 25

var S_RESTORE_PC_HI_RCNT_SHIFT			= S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK			= S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT		= S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK		= S_SAVE_PC_HI_FIRST_REPLAY_MASK

var s_restore_spi_init_lo			= exec_lo
var s_restore_spi_init_hi			= exec_hi
var s_restore_mem_offset			= ttmp12
var s_restore_alloc_size			= ttmp3
var s_restore_tmp				= ttmp6
var s_restore_mem_offset_save			= s_restore_tmp
var s_restore_m0				= s_restore_alloc_size
var s_restore_mode				= ttmp7
var s_restore_flat_scratch			= ttmp2
var s_restore_pc_lo				= ttmp0
var s_restore_pc_hi				= ttmp1
var s_restore_exec_lo				= ttmp14
var s_restore_exec_hi				= ttmp15
var s_restore_status				= ttmp4
var s_restore_trapsts				= ttmp5
var s_restore_xnack_mask			= ttmp13
var s_restore_buf_rsrc0				= ttmp8
var s_restore_buf_rsrc1				= ttmp9
var s_restore_buf_rsrc2				= ttmp10
var s_restore_buf_rsrc3				= ttmp11
var s_restore_size				= ttmp7

shader main
	asic(DEFAULT)
	type(CS)
	wave_size(32)

	s_branch	L_SKIP_RESTORE						//not a restore; might be a regular trap or save

L_JUMP_TO_RESTORE:
	s_branch	L_RESTORE

L_SKIP_RESTORE:
	s_getreg_b32	s_save_status, hwreg(HW_REG_STATUS)			//save STATUS since we will change SCC
	s_andn2_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK
	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK	//check whether this is for save
	s_cbranch_scc1	L_SAVE

	// If STATUS.MEM_VIOL is asserted then halt the wave to prevent
	// the exception from being raised again and blocking context save.
	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
	s_cbranch_scc0	L_FETCH_2ND_TRAP
	s_or_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

L_FETCH_2ND_TRAP:
	// Preserve and clear scalar XNACK state before issuing scalar loads.
	// Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
	// unused space ttmp11[31:24].
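	// For example, IB_STS bit 25 (REPLAY_W64H) moves to ttmp11 bit 31
	// via a left shift of (31 - 25) = 6, and IB_STS bits [21:15]
	// (RCNT, FIRST_REPLAY) move to ttmp11 bits [30:24] via a left
	// shift of (24 - 15) = 9, matching the shift constants used below.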
	s_andn2_b32	ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
	s_getreg_b32	ttmp2, hwreg(HW_REG_IB_STS)
	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
	s_lshl_b32	ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
	s_or_b32	ttmp11, ttmp11, ttmp3
	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_lshl_b32	ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_or_b32	ttmp11, ttmp11, ttmp3
	s_andn2_b32	ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
	s_setreg_b32	hwreg(HW_REG_IB_STS), ttmp2

	// Read second-level TBA/TMA from first-level TMA and jump if available.
	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
	// ttmp12 holds SQ_WAVE_STATUS
	s_getreg_b32	ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
	s_getreg_b32	ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
	s_lshl_b64	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1		// second-level TBA
	s_waitcnt	lgkmcnt(0)
	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1		// second-level TMA
	s_waitcnt	lgkmcnt(0)
	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler has not been set
	s_setpc_b64	[ttmp2, ttmp3]						// jump to second-level trap handler

L_NO_NEXT_TRAP:
	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_and_b32	s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK
	s_cbranch_scc1	L_EXCP_CASE						// Exception, jump back to the shader program directly.
	s_add_u32	ttmp0, ttmp0, 4						// S_TRAP case, add 4 to ttmp0
	s_addc_u32	ttmp1, ttmp1, 0
L_EXCP_CASE:
	s_and_b32	ttmp1, ttmp1, 0xFFFF

	// Restore SQ_WAVE_IB_STS.
	s_lshr_b32	ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_lshr_b32	ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
	s_and_b32	ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
	s_or_b32	ttmp2, ttmp2, ttmp3
	s_setreg_b32	hwreg(HW_REG_IB_STS), ttmp2

	// Restore SQ_WAVE_STATUS.
	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
	s_setreg_b32	hwreg(HW_REG_STATUS), s_save_status

	s_rfe_b64	[ttmp0, ttmp1]

L_SAVE:
	//check whether there is a mem_viol
	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_and_b32	s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
	s_cbranch_scc0	L_NO_PC_REWIND

	//if so, rewind the PC, assuming the GDS operation got NACKed
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp	//clear mem_viol bit
	s_and_b32	s_save_pc_hi, s_save_pc_hi, 0x0000ffff			//pc[47:32]
	s_sub_u32	s_save_pc_lo, s_save_pc_lo, 8				//pc[31:0]-8
	s_subb_u32	s_save_pc_hi, s_save_pc_hi, 0x0

L_NO_PC_REWIND:
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp	//clear saveCtx bit

	s_getreg_b32	s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)
	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)
	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE)
	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS)			//clear RCNT and FIRST_REPLAY and REPLAY_W64H in IB_STS
	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG

	s_setreg_b32	hwreg(HW_REG_IB_STS), s_save_tmp

	/* inform SPI of readiness and wait for SPI's go signal */
	s_mov_b32	s_save_exec_lo, exec_lo					//save EXEC and use EXEC for the go signal from SPI
	s_mov_b32	s_save_exec_hi, exec_hi
	s_mov_b64	exec, 0x0						//clear EXEC to get ready to receive

	s_sendmsg	sendmsg(MSG_SAVEWAVE)					//send SPI a message and wait for SPI's write to EXEC

L_SLEEP:
	// Sleeping for 1 (64 clocks) is not enough with 8 waves per SIMD and
	// would hang the SQ: the 7th and 8th waves cannot win arbitration to
	// execute instructions while the other waves sit in this sleep loop
	// waiting for EXEC != 0.
	s_sleep		0x2
	s_cbranch_execz	L_SLEEP

	/* set up resource constants */
	s_mov_b32	s_save_buf_rsrc0, s_save_spi_init_lo			//base_addr_lo
	s_and_b32	s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF	//base_addr_hi
	s_or_b32	s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32	s_save_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
	s_mov_b32	s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
	s_lshr_b32	s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
	s_or_b32	s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp		//or ATC
	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
	s_lshr_b32	s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
	s_or_b32	s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp		//or MTYPE

	s_mov_b32	s_save_m0, m0

	/* global mem offset */
	s_mov_b32	s_save_mem_offset, 0x0
	s_getreg_b32	s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
	s_lshl_b32	s_wave_size, s_wave_size, S_WAVE_SIZE
	s_or_b32	s_wave_size, s_save_spi_init_hi, s_wave_size		//combine with the SPI init value held in exec_hi; the wave-size bit sits at bit 25
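	// Bit 25 can presumably be shared because the SPI init fields defined
	// above sit at bit 26 and up (FIRST_WAVE at 26, ATC at 27, MTYPE at
	// 28..30) with base_addr_hi in the low half.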

	/* save HW registers */

L_SAVE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
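	// For instance, in wave32 with a vgpr_size field of 0 and no shared
	// VGPRs: size(VGPR) = (0+1)*4*32*4 = 512 bytes, size(SVGPR) = 0 and
	// size(SGPR) = 512 bytes, so the HWREG area starts at byte 1024.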

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_MODE)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

	/* the first wave in the threadgroup */
	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
	s_mov_b32	s_save_exec_hi, 0x0
	s_or_b32	s_save_exec_hi, s_save_tmp, s_save_exec_hi		// save first wave bit to s_save_exec_hi.bits[26]

	/* save SGPRs */
	// Save SGPRs before the LDS save; s0 to s4 can then be used during the LDS save...

	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// back up s_save_buf_rsrc0 into s_save_xnack_mask, since the write_16sgpr_to_mem function will change rsrc0
	s_mov_b32	s_save_xnack_mask, s_save_buf_rsrc0
	s_add_u32	s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
	s_addc_u32	s_save_buf_rsrc1, s_save_buf_rsrc1, 0

	s_mov_b32	m0, 0x0							//SGPR initial index value =0
	s_nop		0x0							//Manually inserted wait states
L_SAVE_SGPR_LOOP:
	// SGPRs are allocated in 16-SGPR granularity
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	s_movrels_b64	s12, s12						//s12 = s[12+m0], s13 = s[13+m0]
	s_movrels_b64	s14, s14						//s14 = s[14+m0], s15 = s[15+m0]

	write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
	s_add_u32	m0, m0, 16						//next sgpr index
	s_cmp_lt_u32	m0, 96							//scc = (m0 < 96) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SGPR_LOOP					//first 96 SGPR save is complete?

	//save the remaining 12 SGPRs
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

	// restore s_save_buf_rsrc0
	s_mov_b32	s_save_buf_rsrc0, s_save_xnack_mask

	/* save the first 4 VGPRs, so the LDS save can then use them */
	// each wave allocates at least 4 VGPRs...

	s_mov_b32	s_save_mem_offset, 0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_4VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
	s_branch	L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPRs are allocated in 4-GPR granularity

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
	s_branch	L_SAVE_LDS

L_SAVE_4VGPR_WAVE64:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPRs are allocated in 4-GPR granularity

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

	/* save LDS */

L_SAVE_LDS:
	// Change EXEC to all threads...
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_SAVE_LDS_DONE						//no lds used? jump to L_SAVE_LDS_DONE

	s_barrier								//LDS is used? wait for other waves in the same TG
	s_and_b32	s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0	L_SAVE_LDS_DONE

	// only the first wave does the LDS save

	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 6			//LDS size in dwords = lds_size * 64dw
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//LDS size in bytes
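	// e.g. an lds_size field of 2 encodes 2 * 64 dwords = 512 bytes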
	s_mov_b32	s_save_buf_rsrc2, s_save_alloc_size			//NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	//compute each lane's byte address (lane_id * 4) in v0
	v_mbcnt_lo_u32_b32	v0, -1, 0
	v_mbcnt_hi_u32_b32	v0, -1, v0
	v_mul_u32_u24	v0, 4, v0

	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_SAVE_LDS_W64

L_SAVE_LDS_W32:
	s_mov_b32	s3, 128
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W32:
	ds_read_b32	v1, v0
	s_waitcnt	0
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

	s_add_u32	m0, m0, s3						//every iteration stores 128 bytes (32 lanes * 4 bytes)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 128						//LDS read address advances by 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W32					//LDS save is complete?

	s_branch	L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
	s_mov_b32	s3, 256
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W64:
	ds_read_b32	v1, v0
	s_waitcnt	0
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

	s_add_u32	m0, m0, s3						//every iteration stores 256 bytes (64 lanes * 4 bytes)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 256						//LDS read address advances by 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W64					//LDS save is complete?

L_SAVE_LDS_DONE:
	/* save VGPRs - the remaining VGPRs */
L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
	s_mov_b32	s_save_mem_offset, (0+128*4)				// skip the 4 VGPRs already saved
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32	s_save_mem_offset, (0+256*4)				// skip the 4 VGPRs already saved
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
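	// e.g. a vgpr_size field of 5 encodes (5+1)*4 = 24 VGPRs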
	//determine whether it is wave32 or wave64
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_SAVE_VGPR_WAVE64

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPRs are allocated in 4-GPR granularity

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4							//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	v_movrels_b32	v1, v1							//v1 = v[1+m0]
	v_movrels_b32	v2, v2							//v2 = v[2+m0]
	v_movrels_b32	v3, v3							//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3

	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4		//every buffer_store_dword does 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP					//VGPR save is complete?

	s_branch	L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4							//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W64_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	v_movrels_b32	v1, v1							//v1 = v[1+m0]
	v_movrels_b32	v2, v2							//v2 = v[2+m0]
	v_movrels_b32	v3, v3							//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4		//every buffer_store_dword does 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP					//VGPR save is complete?

	// The part below saves the shared VGPRs (new for gfx10)
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_SAVE_VGPR_END						//no shared VGPRs used? jump to L_SAVE_VGPR_END
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 3			//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
	//m0 now holds the normal VGPR count; add the shared VGPR count to get the total count.
	//the shared VGPR save starts from index m0
	s_add_u32	s_save_alloc_size, s_save_alloc_size, m0
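	// e.g. a shared_vgpr_size field of 2 encodes 2*8 = 16 shared VGPRs,
	// saved at indices m0..m0+15, 128 bytes (32 lanes * 4 bytes) apiece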
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	s_add_u32	m0, m0, 1						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SHARED_VGPR_WAVE64_LOOP				//SHARED_VGPR save is complete?

L_SAVE_VGPR_END:
	s_branch	L_END_PGM

L_RESTORE:
	/* set up resource constants */
	s_mov_b32	s_restore_buf_rsrc0, s_restore_spi_init_lo		//base_addr_lo
	s_and_b32	s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF	//base_addr_hi
	s_or_b32	s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32	s_restore_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes)
	s_mov_b32	s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or ATC
	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or MTYPE
	//determine whether it is wave32 or wave64
	s_getreg_b32	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
	s_lshl_b32	s_restore_size, s_restore_size, S_WAVE_SIZE
	s_or_b32	s_restore_size, s_restore_spi_init_hi, s_restore_size

	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0	L_RESTORE_VGPR

	/* restore LDS */
L_RESTORE_LDS:
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 6		//LDS size in dwords = lds_size * 64dw
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//LDS size in bytes
	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size		//NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE				//s_wave_size aliases the same ttmp as s_restore_size
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// one dword per lane, written straight to LDS (128 bytes)
	s_add_u32	m0, m0, 128						// advance LDS address by 128 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128		// mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W32					//LDS restore is complete?
	s_branch	L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// one dword per lane, written straight to LDS (256 bytes)
	s_add_u32	m0, m0, 256						// advance LDS address by 256 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256		// mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64					//LDS restore is complete?

	/* restore VGPRs */
L_RESTORE_VGPR:
	// VGPR SR memory offset : 0
	s_mov_b32	s_restore_mem_offset, 0x0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
	//determine whether it is wave32 or wave64
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore starts with v4; v0-v3 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4

L_RESTORE_VGPR_WAVE32_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//every buffer_load_dword does 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP				//VGPR restore (except v0-v3) is complete?

	/* VGPR restore on v0-v3 */
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3

	s_branch	L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore starts with v4; v0-v3 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4

L_RESTORE_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//every buffer_load_dword does 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP				//VGPR restore (except v0-v3) is complete?

	// The part below restores the shared VGPRs (new for gfx10)
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	//shared_vgpr_size
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_RESTORE_V0						//no shared VGPRs used?
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 3		//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
	//m0 now holds the normal VGPR count; add the shared VGPR count to get the total count.
	//the shared VGPR restore starts from index m0
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, m0
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	s_add_u32	m0, m0, 1						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_SHARED_VGPR_WAVE64_LOOP			//SHARED_VGPR restore is complete?

	s_mov_b32	exec_hi, 0xFFFFFFFF					//restore exec_hi before restoring v0-v3!

	/* VGPR restore on v0-v3 */
L_RESTORE_V0:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
	s_waitcnt	vmcnt(0)

	/* restore SGPRs */
	// 108 SGPRs are restored as 4 + 8 + 16*6
	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 20*4	//s108~s127 are not saved
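	// The 512-byte SGPR region reserves 128 slots but only
	// s_sgpr_save_num (108) of them are written, hence the offset backs
	// up by (128 - 108) * 4 = 80 bytes to sit just past the last saved SGPR.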

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	s_mov_b32	m0, s_sgpr_save_num

	read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 4						// m0 = 104: restore s[104:107]
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2

	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 8						// m0 = 96: restore s[96:103]
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6

L_RESTORE_SGPR_LOOP:
	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 16						// restore s[m0:m0+15], working down to s[0:15]
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6
	s_movreld_b64	s8, s8
	s_movreld_b64	s10, s10
	s_movreld_b64	s12, s12
	s_movreld_b64	s14, s14

	s_cmp_eq_u32	m0, 0							//scc = (m0 == 0) ? 1 : 0
	s_cbranch_scc0	L_RESTORE_SGPR_LOOP

	/* restore HW registers */
L_RESTORE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch

	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)						//from now on, it is safe to restore STATUS and IB_STS

	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch

	s_mov_b32	s_restore_tmp, s_restore_pc_hi
	s_and_b32	s_restore_pc_hi, s_restore_tmp, 0x0000ffff		//pc[47:32] //Do it here in order not to affect STATUS

	s_mov_b32	m0, s_restore_m0
	s_mov_b32	exec_lo, s_restore_exec_lo
	s_mov_b32	exec_hi, s_restore_exec_hi

	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
	s_setreg_b32	hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
	s_setreg_b32	hwreg(HW_REG_MODE), s_restore_mode
	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
	s_mov_b32	s_restore_mode, 0x0
	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT
	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0

	s_and_b32	s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
	s_setreg_b32	hwreg(HW_REG_IB_STS), s_restore_mode

	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status			// SCC is included; it was changed by the preceding SALU instructions

	s_barrier								//barrier to ensure LDS is ready before access attempts by any other wave in the same TG

	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution

L_END_PGM:
	s_endpgm
end

function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
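	// m0 carries the byte offset for s_buffer_store_dword; exec_lo is
	// free to use as scratch for the caller's m0 here because the
	// wave's real EXEC was saved earlier in the save path.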
	s_mov_b32	exec_lo, m0
	s_mov_b32	m0, s_mem_offset
	s_buffer_store_dword	s, s_rsrc, m0 glc:1
	s_add_u32	s_mem_offset, s_mem_offset, 4
	s_mov_b32	m0, exec_lo
end


function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
	s_buffer_store_dwordx4	s[12], s_rsrc, 48 glc:1
	s_add_u32	s_rsrc[0], s_rsrc[0], 4*16
	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
end

function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
	s_add_u32	s_rsrc[0], s_rsrc[0], 4*12
	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
end


function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dword	s, s_rsrc, s_mem_offset glc:1
	s_add_u32	s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
	s_buffer_load_dwordx16	s, s_rsrc, s_mem_offset glc:1
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*8
	s_buffer_load_dwordx8	s, s_rsrc, s_mem_offset glc:1
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*4
	s_buffer_load_dwordx4	s, s_rsrc, s_mem_offset glc:1
end


function get_lds_size_bytes(s_lds_size_byte)
	s_getreg_b32	s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_lshl_b32	s_lds_size_byte, s_lds_size_byte, 8			//LDS size in bytes = lds_size * 64 DW * 4 bytes (allocation granularity is 64 DW)
end

function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
	s_lshr_b32	m0, s_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SHIFT_W64
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)		//bytes = (vgpr_size + 1) * 4 VGPRs * 32 lanes * 4 bytes (non-zero value)
	s_branch	L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)		//bytes = (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (non-zero value)
L_SHIFT_DONE:
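	// e.g. in wave32, a vgpr_size field of 0 yields (0+1) << (2+7)
	// = 512 bytes: 4 VGPRs * 32 lanes * 4 bytes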
end

function get_svgpr_size_bytes(s_svgpr_size_byte)
	s_getreg_b32	s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_lshl_b32	s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
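	// bytes = shared_vgpr_size * 8 shared VGPRs * 128 bytes
	// (32 lanes * 4 bytes per shared VGPR)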
end

function get_sgpr_size_bytes
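	// 512 bytes = 128 SGPR slots * 4 bytes; only s_sgpr_save_num (108)
	// SGPRs are actually written, the remainder is padding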
	return 512
end

function get_hwreg_size_bytes
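	// 128 bytes = 32 dwords; the save path above writes 11 of them (m0,
	// pc, exec, status, trapsts, xnack_mask, mode, flat_scratch)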
	return 128
end
    968