/*
 * Copyright 2015-2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#if 0
HW (VI) source code for CWSR trap handler
#Version 18 + multiple trap handler

// this performance-optimal version was originally from Seven Xu at SRDC

// Revision #18   --...
/* Rev History
** #1. Branch from gc dv.   //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(merged, skipped - already fixed by PV)
** #4. SR Memory Layout:
**             1. VGPR-SGPR-HWREG-{LDS}
**             2. tba_hi.bits.26 - reconfigured as the first-wave-in-TG bit, to defer the LDS save for a threadgroup... performance concern...
** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer? (No need, already matches the swizzle pattern; more investigation)
** #7. Update: 1. don't barrier if noLDS
** #8. Branch: 1. Branch to ver#0, which is very similar to the gc dv version
**             2. Fix SQ issue by s_sleep 2
** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
**             2. optimize s_buffer save by bursting 16 sgprs...
** #10. Update 1. Optimize sgpr restore by bursting 16 sgprs.
** #11. Update 1. Add 2 more timestamps for the debug version
** #12. Update 1. Add VGPR SR using DWx4; some cases improve and some cases drop performance
** #13. Integ  1. Always use MUBUF for PV trap shader...
** #14. Update 1. s_buffer_store soft clause...
** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combining; large perf improvement.
** #16. Update 1. PERF - unrolling LDS_DMA saved 2500 cycles in the IP tree
** #17. Update 1. FUNC - LDS_DMA has issues with ATC; replace with ds_read/buffer_store for the save part [TODO restore part]
**             2. PERF - Save LDS before saving VGPRs to cover the long LDS save latency...
** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
**             2. FUNC - Handle non-CWSR traps
*/

var G8SR_WDMEM_HWREG_OFFSET = 0
var G8SR_WDMEM_SGPR_OFFSET  = 128  // in bytes

// Keep these definitions the same as in the app shader. These 2 timestamps are part of the app shader... They should come before any save and after any restore.

var G8SR_DEBUG_TIMESTAMP = 0
var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4  // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
var s_g8sr_ts_save_s    = s[34:35]   // save start
var s_g8sr_ts_sq_save_msg  = s[36:37]   // the save shader sends the SAVEWAVE msg to SPI
var s_g8sr_ts_spi_wrexec   = s[38:39]   // SPI writes the sr address to SQ
var s_g8sr_ts_save_d    = s[40:41]   // save end
var s_g8sr_ts_restore_s = s[42:43]   // restore start
var s_g8sr_ts_restore_d = s[44:45]   // restore end

var G8SR_VGPR_SR_IN_DWX4 = 0
var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000    // DWx4 stride is 4*4 bytes
var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
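
// A worked example of the stride encoding above (assuming the standard GCN
// buffer-resource layout, where the stride field sits in word1 bits [29:16]):
//   stride  4 bytes  -> 4  << 16 = 0x00040000  (S_SAVE_BUF_RSRC_WORD1_STRIDE)
//   stride 16 bytes  -> 16 << 16 = 0x00100000  (..._STRIDE_DWx4)
// This is why the code below clears word1 with 0x0000FFFF before OR-ing in a
// new stride: the low 16 bits hold base_addr_hi and must be preserved.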


/*************************************************************************/
/*                  control on how to run the shader                     */
/*************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in an EMU run)
var EMU_RUN_HACK                    =   0
var EMU_RUN_HACK_RESTORE_NORMAL     =   0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT   =   0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE   =   0
var EMU_RUN_HACK_SAVE_FIRST_TIME    =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS                        =   1
var WG_BASE_ADDR_LO                 =   0x9000a000
var WG_BASE_ADDR_HI                 =   0x0
var WAVE_SPACE                      =   0x5000              //memory size that each wave occupies in workgroup state mem
var CTX_SAVE_CONTROL                =   0x0
var CTX_RESTORE_CONTROL             =   CTX_SAVE_CONTROL
var SIM_RUN_HACK                    =   0                   //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in an RTL run)
var SGPR_SAVE_USE_SQC               =   1                   //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF      =   0                   //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
var SWIZZLE_EN                      =   0                   //whether we use swizzled buffer addressing

/**************************************************************************/
/*                      variables                                         */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT  = 23
var SQ_WAVE_STATUS_INST_ATC_MASK   = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT    = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE     = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE    = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT   = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE    = 3                     //FIXME  sq.blk still has 4 bits at this time while SQ programming guide has 3 bits

var SQ_WAVE_TRAPSTS_SAVECTX_MASK    =   0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK       =   0x1FF                   // Exception mask
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT   =   10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK   =   0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT  =   8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK    =   0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT   =   0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE    =   10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK   =   0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT  =   11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE   =   21
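
// Sanity check on the TRAPSTS fields above: each MASK is consistent with its
// SHIFT/SIZE pair via mask = ((1 << size) - 1) << shift. For example:
//   SAVECTX:       ((1 << 1)  - 1) << 10 = 0x400
//   PRE_SAVECTX:   ((1 << 10) - 1) << 0  = 0x3FF
//   POST_SAVECTX:  ((1 << 21) - 1) << 11 = 0xFFFFF800
// EXCE_MASK (0x1FF) covers bits [8:0], i.e. the exception bits up to and
// including MEM_VIOL at bit 8.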

var SQ_WAVE_IB_STS_RCNT_SHIFT           =   16                  //FIXME
var SQ_WAVE_IB_STS_RCNT_SIZE            =   4                   //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT   =   15                  //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    =   1                   //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG   = 0x00007FFF    //FIXME

var SQ_BUF_RSRC_WORD1_ATC_SHIFT     =   24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT   =   27


/*      Save        */
var S_SAVE_BUF_RSRC_WORD1_STRIDE        =   0x00040000          //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC          =   0x00807FAC          //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE

var S_SAVE_SPI_INIT_ATC_MASK            =   0x08000000          //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT           =   27
var S_SAVE_SPI_INIT_MTYPE_MASK          =   0x70000000          //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT         =   28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK     =   0x04000000          //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT    =   26

var S_SAVE_PC_HI_RCNT_SHIFT             =   28                  //FIXME  check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK              =   0xF0000000          //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT     =   27                  //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK      =   0x08000000          //FIXME

var s_save_spi_init_lo              =   exec_lo
var s_save_spi_init_hi              =   exec_hi

                                                //tba_lo and tba_hi need to be saved/restored
var s_save_pc_lo            =   ttmp0           //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
var s_save_pc_hi            =   ttmp1
var s_save_exec_lo          =   ttmp2
var s_save_exec_hi          =   ttmp3
var s_save_status           =   ttmp4
var s_save_trapsts          =   ttmp5           //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo    =   ttmp6
var s_save_xnack_mask_hi    =   ttmp7
var s_save_buf_rsrc0        =   ttmp8
var s_save_buf_rsrc1        =   ttmp9
var s_save_buf_rsrc2        =   ttmp10
var s_save_buf_rsrc3        =   ttmp11

var s_save_mem_offset       =   tma_lo
var s_save_alloc_size       =   s_save_trapsts          //conflict
var s_save_tmp              =   s_save_buf_rsrc2        //shared with s_save_buf_rsrc2  (conflict: should not use mem access with s_save_tmp at the same time)
var s_save_m0               =   tma_hi

/*      Restore     */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE         =   S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC           =   S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_ATC_MASK             =   0x08000000          //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT            =   27
var S_RESTORE_SPI_INIT_MTYPE_MASK           =   0x70000000          //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT          =   28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK      =   0x04000000          //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT     =   26

var S_RESTORE_PC_HI_RCNT_SHIFT              =   S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK               =   S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT      =   S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK       =   S_SAVE_PC_HI_FIRST_REPLAY_MASK

var s_restore_spi_init_lo                   =   exec_lo
var s_restore_spi_init_hi                   =   exec_hi

var s_restore_mem_offset        =   ttmp2
var s_restore_alloc_size        =   ttmp3
var s_restore_tmp               =   ttmp6               //tba_lo/hi need to be restored
var s_restore_mem_offset_save   =   s_restore_tmp       //no conflict

var s_restore_m0            =   s_restore_alloc_size    //no conflict

var s_restore_mode          =   ttmp7

var s_restore_pc_lo         =   ttmp0
var s_restore_pc_hi         =   ttmp1
var s_restore_exec_lo       =   tma_lo                  //no conflict
var s_restore_exec_hi       =   tma_hi                  //no conflict
var s_restore_status        =   ttmp4
var s_restore_trapsts       =   ttmp5
var s_restore_xnack_mask_lo =   xnack_mask_lo
var s_restore_xnack_mask_hi =   xnack_mask_hi
var s_restore_buf_rsrc0     =   ttmp8
var s_restore_buf_rsrc1     =   ttmp9
var s_restore_buf_rsrc2     =   ttmp10
var s_restore_buf_rsrc3     =   ttmp11

/**************************************************************************/
/*                      trap handler entry points                         */
/**************************************************************************/
/* Shader Main */

shader main
  asic(VI)
  type(CS)


    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))                   //hack to use trap_id for determining save/restore
        //FIXME VCCZ un-init assertion s_getreg_b32     s_save_status, hwreg(HW_REG_STATUS)         //save STATUS since we will change SCC
        s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000              //changes SCC
        s_cmp_eq_u32 s_save_tmp, 0x007e0000                         //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
        s_cbranch_scc0 L_JUMP_TO_RESTORE                            //no need to recover STATUS here since we are going to RESTORE
        //FIXME  s_setreg_b32   hwreg(HW_REG_STATUS),   s_save_status       //need to recover STATUS since we are going to SAVE
        s_branch L_SKIP_RESTORE                                     //NOT restore, SAVE actually
    else
        s_branch L_SKIP_RESTORE                                     //NOT restore. might be a regular trap or save
    end
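
// How the trap_id compare above works (per the TTMP layout noted earlier:
// {TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}):
// trapID occupies ttmp1 bits [23:16], so masking s_save_pc_hi with 0xffff0000
// and comparing against 0x007e0000 tests for trapID == 0x7e with the
// pc_rewind/HT bits all zero. This is an EMU-only convention, not HW behavior.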

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE                                              //restore

L_SKIP_RESTORE:

    s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)                             //save STATUS since we will change SCC
    s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //check whether this is for save
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    //check whether this is for save
    s_cbranch_scc1  L_SAVE                                      //this is the operation for save

    // *********    Handle non-CWSR traps       *******************
if (!EMU_RUN_HACK)
    /* read tba and tma for the next-level trap handler; ttmp4 is used as s_save_status */
    s_load_dwordx4  [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
    s_waitcnt lgkmcnt(0)
    s_or_b32        ttmp7, ttmp8, ttmp9
    s_cbranch_scc0  L_NO_NEXT_TRAP //the next-level trap handler has not been set
    s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
    s_setpc_b64     [ttmp8,ttmp9] //jump to the next-level trap handler

L_NO_NEXT_TRAP:
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
    s_cbranch_scc1  L_EXCP_CASE   // Exception, jump back to the shader program directly.
    s_add_u32       ttmp0, ttmp0, 4   // S_TRAP case, add 4 to ttmp0
    s_addc_u32  ttmp1, ttmp1, 0
L_EXCP_CASE:
    s_and_b32   ttmp1, ttmp1, 0xFFFF
    s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
    s_rfe_b64       [ttmp0, ttmp1]
end
    // *********        End handling of non-CWSR traps   *******************

/**************************************************************************/
/*                      save routine                                      */
/**************************************************************************/

L_SAVE:

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime   s_g8sr_ts_save_s
        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
end

    //check whether there is a mem_viol
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32   s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
    s_cbranch_scc0  L_NO_PC_REWIND

    //if so, the PC needs to be rewound, assuming a GDS operation got NACKed
    s_mov_b32       s_save_tmp, 0                                                           //clear mem_viol bit
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp    //clear mem_viol bit
    s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
    s_sub_u32       s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
    s_subb_u32      s_save_pc_hi, s_save_pc_hi, 0x0           // -scc
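
    // The rewind above is 64-bit arithmetic spelled out by hand: the borrow
    // (carried in SCC by s_sub_u32) propagates into pc[47:32] via s_subb_u32.
    // E.g. if pc = 0x1_00000004, then pc_lo - 8 wraps to 0xFFFFFFFC with
    // SCC = 1, and pc_hi - 0 - SCC yields 0x0, giving 0x0_FFFFFFFC overall.
    // The -8 assumes the NACKed instruction was 8 bytes long.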

L_NO_PC_REWIND:
    s_mov_b32       s_save_tmp, 0                                                           //clear saveCtx bit
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp     //clear saveCtx bit

    s_mov_b32       s_save_xnack_mask_lo,   xnack_mask_lo                                   //save XNACK_MASK
    s_mov_b32       s_save_xnack_mask_hi,   xnack_mask_hi    //saving XNACK must come before any memory operation
    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)                   //save RCNT
    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)   //save FIRST_REPLAY
    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS)                                        //clear RCNT and FIRST_REPLAY in IB_STS
    s_and_b32       s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG

    s_setreg_b32    hwreg(HW_REG_IB_STS), s_save_tmp
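
    // Packing scheme used above: since the earlier mask left only PC[47:32]
    // in s_save_pc_hi, bits [31:27] are free, so RCNT (4 bits) is parked in
    // pc_hi[31:28] and FIRST_REPLAY in pc_hi[27]. The restore path undoes
    // this with the matching S_SAVE_PC_HI_*_MASK/SHIFT pairs, e.g.
    //   rcnt = (pc_hi & 0xF0000000) >> 28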

    /*      inform SPI of the readiness and wait for SPI's go signal */
    s_mov_b32       s_save_exec_lo, exec_lo                                                 //save EXEC and use EXEC for the go signal from SPI
    s_mov_b32       s_save_exec_hi, exec_hi
    s_mov_b64       exec,   0x0                                                             //clear EXEC to get ready to receive

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime  s_g8sr_ts_sq_save_msg
        s_waitcnt lgkmcnt(0)
end

    if (EMU_RUN_HACK)

    else
        s_sendmsg   sendmsg(MSG_SAVEWAVE)  //send SPI a message and wait for SPI's write to EXEC
    end

  L_SLEEP:
    s_sleep 0x2                // sleeping 1 (64clk) is not enough for 8 waves per SIMD and will hang SQ: the 7th/8th wave cannot win arbitration to execute an instruction while the other waves are stuck in this sleep loop waiting for wrexec!=0

    if (EMU_RUN_HACK)

    else
        s_cbranch_execz L_SLEEP
    end
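
    // s_sleep's operand is in units of 64 clocks (per the comment above), so
    // s_sleep 0x2 parks the wave for roughly 128 clocks per loop iteration.
    // The loop exits once SPI writes a nonzero value into EXEC
    // (s_cbranch_execz falls through only when EXEC != 0).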

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime  s_g8sr_ts_spi_wrexec
        s_waitcnt lgkmcnt(0)
end

    /*      setup Resource Constants    */
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
        //calculate wd_addr using the absolute thread id
        v_readlane_b32 s_save_tmp, v9, 0
        s_lshr_b32 s_save_tmp, s_save_tmp, 6
        s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
    else
    end
    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
    else
    end


    s_mov_b32       s_save_buf_rsrc0,   s_save_spi_init_lo                                                      //base_addr_lo
    s_and_b32       s_save_buf_rsrc1,   s_save_spi_init_hi, 0x0000FFFF                                          //base_addr_hi
    s_or_b32        s_save_buf_rsrc1,   s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_save_buf_rsrc2,   0                                                                       //NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
    s_mov_b32       s_save_buf_rsrc3,   S_SAVE_BUF_RSRC_WORD3_MISC
    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)         //get ATC bit into position
    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or ATC
    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)     //get MTYPE bits into position
    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or MTYPE

    //FIXME  right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi  (might need to save them before using them?)
    s_mov_b32       s_save_m0,          m0                                                                  //save M0

    /*      global mem offset           */
    s_mov_b32       s_save_mem_offset,  0x0                                                                     //mem offset initial value = 0




    /*      save HW registers   */
    //////////////////////////////

  L_SAVE_HWREG:
        // HWREG SR memory offset : size(VGPR)+size(SGPR)
       get_vgpr_size_bytes(s_save_mem_offset)
       get_sgpr_size_bytes(s_save_tmp)
       s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp


    s_mov_b32       s_save_buf_rsrc2, 0x4                               //NUM_RECORDS   in bytes
    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)                  //M0

    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
        s_mov_b32   tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
        s_mov_b32   tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
    end

    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)                   //PC
    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)             //EXEC
    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)              //STATUS

    //s_save_trapsts conflicts with s_save_alloc_size
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)             //TRAPSTS

    write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_LO
    write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_HI

    //using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
    s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)                                                   //MODE
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_LO
    write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_HI



    /*      the first wave in the threadgroup    */
        // save the first_wave bit in tba_hi's unused bit 26
    s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK     // extract the first-wave bit
    //s_or_b32        tba_hi, s_save_tmp, tba_hi                                        // save first wave bit to tba_hi.bits[26]
    s_mov_b32        s_save_exec_hi, 0x0
    s_or_b32         s_save_exec_hi, s_save_tmp, s_save_exec_hi                          // save the first-wave bit to s_save_exec_hi.bits[26]


    /*          save SGPRs      */
        // Save SGPRs before the LDS save, so s0 to s4 can be used during the LDS save...
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)               //sgpr_size
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 4                         //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)

    if (SGPR_SAVE_USE_SQC)
        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 2                    //NUM_RECORDS in bytes
    else
        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 8                    //NUM_RECORDS in bytes (64 threads)
    end
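
    // Worked example of the sizing above: an HW_REG_GPR_ALLOC sgpr_size field
    // of 6 means (6+1)*16 = 112 SGPRs. On the SQC (scalar) path NUM_RECORDS
    // is 112 << 2 = 448 bytes (4 bytes per SGPR); on the vector path it is
    // 112 << 8 bytes, because each record is replicated across 64 lanes of
    // 4 bytes each (64*4 = 256 = 1<<8).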

    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    // back up s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since the write_16sgpr_to_mem function will change rsrc0
    //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
    s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0

    s_mov_b32       m0, 0x0                         //SGPR initial index value = 0
  L_SAVE_SGPR_LOOP:
    // SGPRs are allocated in 16-SGPR granularity
    s_movrels_b64   s0, s0     //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64   s2, s2     //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64   s4, s4     //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64   s6, s6     //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64   s8, s8     //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64   s10, s10   //s10 = s[10+m0], s11 = s[11+m0]
    s_movrels_b64   s12, s12   //s12 = s[12+m0], s13 = s[13+m0]
    s_movrels_b64   s14, s14   //s14 = s[14+m0], s15 = s[15+m0]

    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should come from using s_buffer_store_dwordx4
    s_add_u32       m0, m0, 16                                                      //next sgpr index
    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_SGPR_LOOP                                    //SGPR save is complete?
    // restore s_save_buf_rsrc0,1
    //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
    s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo




    /*          save the first 4 VGPRs, so the LDS save can use them        */
        // each wave allocates at least 4 VGPRs...
    /////////////////////////////////////////////////////////////////////////////////////

    s_mov_b32       s_save_mem_offset, 0
    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    // VGPRs are allocated in 4-GPR granularity

if G8SR_VGPR_SR_IN_DWX4
        // the const stride for DWx4 is 4*4 bytes
        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes

        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
else
        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
end
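
// Why offset:256 per VGPR in the MUBUF path above: with ADD_TID_ENABLE and a
// 4-byte const stride, one buffer_store_dword writes 64 lanes * 4 bytes =
// 256 bytes, so consecutive VGPRs land 256 bytes apart. The DWx4 path stores
// v[0:3] in one instruction instead, using the 16-byte stride set just before.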



    /*          save LDS        */
    //////////////////////////////

  L_SAVE_LDS:

        // Change EXEC to all threads...
    s_mov_b32       exec_lo, 0xFFFFFFFF   //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)             //lds_size
    s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF                //is lds_size zero?
    s_cbranch_scc0  L_SAVE_LDS_DONE                                                                            //no lds used? jump to L_SAVE_DONE

    s_barrier               //LDS is used? wait for the other waves in the same TG
    //s_and_b32     s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
    s_and_b32       s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
    s_cbranch_scc0  L_SAVE_LDS_DONE

        // only the first wave does the LDS save;

    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 6                         //LDS size in dwords = lds_size * 64dw
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //LDS size in bytes
    s_mov_b32       s_save_buf_rsrc2,  s_save_alloc_size                            //NUM_RECORDS in bytes
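
    // LDS sizing example: the LDS_ALLOC lds_size field counts 64-dword
    // granules, so a field value of 2 means 2 << 6 = 128 dwords, then
    // 128 << 2 = 512 bytes. NUM_RECORDS is set to exactly this byte count so
    // buffer bounds checking clips the save to the allocated LDS.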

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_save_mem_offset)
    get_sgpr_size_bytes(s_save_tmp)
    s_add_u32  s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()


    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                  //NUM_RECORDS in bytes
    end

    s_mov_b32       m0, 0x0                                               //lds_offset initial value = 0


var LDS_DMA_ENABLE = 0
var UNROLL = 0
if UNROLL==0 && LDS_DMA_ENABLE==1
        s_mov_b32  s3, 256*2
        s_nop 0
        s_nop 0
        s_nop 0
  L_SAVE_LDS_LOOP:
        //TODO: it looks like the 2-instruction buffer_store/load clause for s/r will hurt performance???
    if (SAVE_LDS)     //SPI always allocates LDS space in 128DW granularity
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1            // first 64DW
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
    end

    s_add_u32       m0, m0, s3                                          //each buffer_store_lds does 256 bytes; two per iteration
    s_add_u32       s_save_mem_offset, s_save_mem_offset, s3                            //mem offset increased by 512 bytes
    s_cmp_lt_u32    m0, s_save_alloc_size                                               //scc=(m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_LDS_LOOP                                                     //LDS save is complete?

elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache misses
      // store from the highest LDS address to the lowest
      s_mov_b32  s3, 256*2
      s_sub_u32  m0, s_save_alloc_size, s3
      s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
      s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9   // how many 128DW chunks...
      s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size   // store from the highest addr to the lowest
      s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4   // PC offset increment; each LDS save block costs 6*4 bytes of instructions
      s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4   //3 DW for the 3 instructions below: s_add, s_addc and s_setpc
      s_nop 0
      s_nop 0
      s_nop 0   //pad 3 DW to align LDS_DMA with 64 bytes
      s_getpc_b64 s[0:1]                              // reuse s[0:1], since s[0:1] is already saved
      s_add_u32   s0, s0, s_save_alloc_size
      s_addc_u32  s1, s1, 0
      s_setpc_b64 s[0:1]


       for var i = 0; i < 128; i++
            // be careful to make this a 64-byte aligned address, which could improve performance...
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0           // first 64DW
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256           // second 64DW

        if i!=127
            s_sub_u32  m0, m0, s3      // use an sgpr to shrink the 2DW inst to a 1DW inst to improve performance, i.e. pack more LDS_DMA insts into one cacheline
            s_sub_u32  s_save_mem_offset, s_save_mem_offset,  s3
            end
       end

else   // BUFFER_STORE
      v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
      v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2     // tid
      v_mul_i32_i24 v2, v3, 8   // tid*8
      v_mov_b32 v3, 256*2
      s_mov_b32 m0, 0x10000
      s_mov_b32 s0, s_save_buf_rsrc3
      s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF    // disable add_tid
      s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000   //DFMT

L_SAVE_LDS_LOOP_VECTOR:
      ds_read_b64 v[0:1], v2    //x = LDS[a], byte address
      s_waitcnt lgkmcnt(0)
      buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1  glc:1  slc:1
//      s_waitcnt vmcnt(0)
      v_add_u32 v2, vcc[0:1], v2, v3
      v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
      s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

      // restore rsrc3
      s_mov_b32 s_save_buf_rsrc3, s0

end

L_SAVE_LDS_DONE:



    /*          save VGPRs  - save the rest of the VGPRs        */
    //////////////////////////////////////////////////////////////////////////////////////
  L_SAVE_VGPR:
    // VGPR SR memory offset: 0
    // TODO rearrange the RSRC words to use swizzle for VGPR save...

    s_mov_b32       s_save_mem_offset, (0+256*4)                                    // for the remaining VGPRs; v0-v3 were already saved at offset 0
    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)                   //vgpr_size
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
    s_lshl_b32      s_save_buf_rsrc2,  s_save_alloc_size, 8                         //NUM_RECORDS in bytes (64 threads*4)
    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    // VGPRs are allocated in 4-GPR granularity

if G8SR_VGPR_SR_IN_DWX4
        // the const stride for DWx4 is 4*4 bytes
        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes

        s_mov_b32         m0, 4     // skip the first 4 VGPRs
        s_cmp_lt_u32      m0, s_save_alloc_size
        s_cbranch_scc0    L_SAVE_VGPR_LOOP_END      // no more vgprs

        s_set_gpr_idx_on  m0, 0x1   // This will change M0
        s_add_u32         s_save_alloc_size, s_save_alloc_size, 0x1000  // because the above inst changed m0
L_SAVE_VGPR_LOOP:
        v_mov_b32         v0, v0   // v0 = v[0+m0]
        v_mov_b32         v1, v1
        v_mov_b32         v2, v2
        v_mov_b32         v3, v3


        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        s_add_u32         m0, m0, 4
        s_add_u32         s_save_mem_offset, s_save_mem_offset, 256*4
        s_cmp_lt_u32      m0, s_save_alloc_size
    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
    s_set_gpr_idx_off
L_SAVE_VGPR_LOOP_END:

        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
else
    // VGPR store using dw burst
    s_mov_b32         m0, 0x4   //VGPR initial index value = 4 (v0-v3 already saved)
    s_cmp_lt_u32      m0, s_save_alloc_size
    s_cbranch_scc0    L_SAVE_VGPR_END


    s_set_gpr_idx_on    m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 0x1000                    //add 0x1000 since we compare m0 against it later
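
    // Why add 0x1000: s_set_gpr_idx_on writes the index mode into M0[15:12]
    // (0x1 presumably selects source-0-relative indexing per the gfx8 ISA
    // encoding), so M0 now reads back as index | 0x1000. Biasing
    // s_save_alloc_size by the same 0x1000 keeps the
    // s_cmp_lt_u32 m0, s_save_alloc_size loop test below valid.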

  L_SAVE_VGPR_LOOP:
    v_mov_b32       v0, v0              //v0 = v[0+m0]
    v_mov_b32       v1, v1              //v1 = v[1+m0]
    v_mov_b32       v2, v2              //v2 = v[2+m0]
    v_mov_b32       v3, v3              //v3 = v[3+m0]

    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
    end

    s_add_u32       m0, m0, 4                                                       //next vgpr index
    s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4                     //every buffer_store_dword does 256 bytes
    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
    s_set_gpr_idx_off
end

L_SAVE_VGPR_END:






    /*     S_PGM_END_SAVED  */                              //FIXME  graphics ONLY
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
        s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
        s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
    else
    end

// Save Done timestamp
if G8SR_DEBUG_TIMESTAMP
        s_memrealtime   s_g8sr_ts_save_d
        // SGPR SR memory offset : size(VGPR)
        get_vgpr_size_bytes(s_save_mem_offset)
        s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
        // Need to reset rsrc2??
        s_mov_b32 m0, s_save_mem_offset
        s_mov_b32 s_save_buf_rsrc2,  0x1000000                                  //NUM_RECORDS in bytes
        s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0       glc:1
end


    s_branch    L_END_PGM



/**************************************************************************/
/*                      restore routine                                   */
/**************************************************************************/

L_RESTORE:
    /*      Setup Resource Constants    */
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
        //calculate wd_addr using the absolute thread id
        v_readlane_b32 s_restore_tmp, v9, 0
        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
        s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
        s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
    else
    end

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime   s_g8sr_ts_restore_s
        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
        // tma_lo/hi are sgprs 110, 111, which will not be used in the 112-SGPR-allocated case...
        s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
        s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]   //back up ts to ttmp0/1, since exec will finally be restored..
end



    s_mov_b32       s_restore_buf_rsrc0,    s_restore_spi_init_lo                                                           //base_addr_lo
    s_and_b32       s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF                                               //base_addr_hi
    s_or_b32        s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_restore_buf_rsrc2,    0                                                                               //NUM_RECORDS initial value = 0 (in bytes)
    s_mov_b32       s_restore_buf_rsrc3,    S_RESTORE_BUF_RSRC_WORD3_MISC
    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)       //get ATC bit into position
    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or ATC
    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)   //get MTYPE bits into position
    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or MTYPE

    /*      global mem offset           */
//  s_mov_b32       s_restore_mem_offset, 0x0                               //mem offset initial value = 0

    /*      the first wave in the threadgroup    */
    s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0  L_RESTORE_VGPR

    /*          restore LDS     */
    //////////////////////////////
  L_RESTORE_LDS:

    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE although this can be moved ahead
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)              //lds_size
    s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF                  //is lds_size zero?
    s_cbranch_scc0  L_RESTORE_VGPR                                                          //no lds used? jump to L_RESTORE_VGPR
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 6                           //LDS size in dwords = lds_size * 64dw
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //LDS size in bytes
    s_mov_b32       s_restore_buf_rsrc2,    s_restore_alloc_size                            //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()            //FIXME, check whether the offset overflows???


    if (SWIZZLE_EN)
        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
    end
    s_mov_b32       m0, 0x0                                                                 //lds_offset initial value = 0

  L_RESTORE_LDS_LOOP:
    if (SAVE_LDS)
        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1                    // first 64DW
        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256         // second 64DW
    end
    s_add_u32       m0, m0, 256*2                                               // 128 DW
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*2           //mem offset increased by 128DW
    s_cmp_lt_u32    m0, s_restore_alloc_size                                    //scc=(m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_LDS_LOOP                                                      //LDS restore is complete?


    /*          restore VGPRs       */
    //////////////////////////////
  L_RESTORE_VGPR:
        // VGPR SR memory offset : 0
    s_mov_b32       s_restore_mem_offset, 0x0
    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE although this can be moved ahead
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)    //vgpr_size
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
    s_lshl_b32      s_restore_buf_rsrc2,  s_restore_alloc_size, 8                           //NUM_RECORDS in bytes (64 threads*4)
    if (SWIZZLE_EN)
        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
    end

if G8SR_VGPR_SR_IN_DWX4
     get_vgpr_size_bytes(s_restore_mem_offset)
     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4

     // the const stride for DWx4 is 4*4 bytes
     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes

     s_mov_b32         m0, s_restore_alloc_size
     s_set_gpr_idx_on  m0, 0x8    // Note: this will change m0

L_RESTORE_VGPR_LOOP:
     buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
     s_waitcnt vmcnt(0)
     s_sub_u32         m0, m0, 4
     v_mov_b32         v0, v0   // v[0+m0] = v0
     v_mov_b32         v1, v1
     v_mov_b32         v2, v2
     v_mov_b32         v3, v3
     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4
     s_cmp_eq_u32      m0, 0x8000
     s_cbranch_scc0    L_RESTORE_VGPR_LOOP
     s_set_gpr_idx_off

     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes

else
    // VGPR load using dw burst
    s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset     // restore starts with v4; v0-v3 will be restored last
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_mov_b32       m0, 4                               //VGPR initial index value = 4
    s_set_gpr_idx_on  m0, 0x8                       //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 0x8000                      //add 0x8000 since we compare m0 against it later

  L_RESTORE_VGPR_LOOP:
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
    end
    s_waitcnt       vmcnt(0)                                                                //ensure data ready
    v_mov_b32       v0, v0                                                                  //v[0+m0] = v0
    v_mov_b32       v1, v1                                                                  //v[1+m0] = v1
    v_mov_b32       v2, v2                                                                  //v[2+m0] = v2
    v_mov_b32       v3, v3                                                                  //v[3+m0] = v3
    s_add_u32       m0, m0, 4                                                               //next vgpr index
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4                           //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32    m0, s_restore_alloc_size                                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_VGPR_LOOP                                                     //VGPR restore (except v0-v3) is complete?
    s_set_gpr_idx_off
                                                                                            /* VGPR restore on v0-v3 */
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1
        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256
        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*2
        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*3
    end

end

    /*          restore SGPRs       */
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4     // restore SGPRs from S[n] down to S[0], in groups of 16 sgprs
    // TODO, change RSRC word to rearrange memory layout for SGPRS
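
    // Offset math for the descending SGPR restore: the SGPR region ends at
    // size(VGPR) + size(SGPR), and each iteration handles one 16-SGPR group
    // (16*4 = 64 bytes), so the pointer starts one group before the end and
    // walks downward as m0 counts from s_restore_alloc_size toward 0. The
    // per-group pointer decrement presumably lives inside the
    // read_16sgpr_from_mem macro, which is defined elsewhere in this file.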

    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)                //sgpr_size
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 4                           //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)

    if (SGPR_SAVE_USE_SQC)
        s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 2                     //NUM_RECORDS in bytes
    else
        s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 8                     //NUM_RECORDS in bytes (64 threads)
    end
    if (SWIZZLE_EN)
        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
    end

    /* If 112 SGPRs are allocated, 4 sgprs are not used: TBA(108,109) and TMA(110,111).
       However, it is safe to restore these 4 SGPRs anyway, since TBA and TMA will later be restored by HWREG.
    */
    s_mov_b32 m0, s_restore_alloc_size

 L_RESTORE_SGPR_LOOP:
    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)  //PV: further performance improvement can be made
    s_waitcnt       lgkmcnt(0)                                                              //ensure data ready

    s_sub_u32 m0, m0, 16    // Restore from S[n] down to S[0]

    s_movreld_b64   s0, s0      //s[0+m0] = s0
    s_movreld_b64   s2, s2
    s_movreld_b64   s4, s4
    s_movreld_b64   s6, s6
    s_movreld_b64   s8, s8
    s_movreld_b64   s10, s10
    s_movreld_b64   s12, s12
    s_movreld_b64   s14, s14

    s_cmp_eq_u32    m0, 0               //scc = (m0 == 0) ? 1 : 0
    s_cbranch_scc0  L_RESTORE_SGPR_LOOP             //SGPR restore is complete?
 971
 972    /*      restore HW registers    */
 973    //////////////////////////////
 974  L_RESTORE_HWREG:
 975
 976
 977if G8SR_DEBUG_TIMESTAMP
 978      s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
 979      s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
 980end
 981
 982    // HWREG SR memory offset : size(VGPR)+size(SGPR)
 983    get_vgpr_size_bytes(s_restore_mem_offset)
 984    get_sgpr_size_bytes(s_restore_tmp)
 985    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
 986
 987
 988    s_mov_b32       s_restore_buf_rsrc2, 0x4                                                //NUM_RECORDS   in bytes
 989    if (SWIZZLE_EN)
 990        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
 991    else
 992        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
 993    end
 994
 995    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)                    //M0
 996    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)             //PC
 997    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
 998    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)               //EXEC
 999    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1000    read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)                //STATUS
1001    read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)               //TRAPSTS
1002    read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_LO
1003    read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_HI
1004    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)              //MODE
1005    read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_LO
1006    read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_HI
1007
1008    s_waitcnt       lgkmcnt(0)                                                                                      //from now on, it is safe to restore STATUS and IB_STS
1009
1010    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff      //pc[47:32]        //Do it here in order not to affect STATUS
1011
1012    //for a normal save & restore, the saved PC points to the next instruction to execute, so no adjustment is needed; otherwise:
1013    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
1014        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8            //pc[31:0]+8     //two back-to-back s_trap are used (first for save and second for restore)
1015        s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
1016    end
1017    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
1018        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save is hack through s_trap but restore is normal
1019        s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
1020    end
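    // Illustrative note (assumes the 4-byte SOPP encoding of s_trap): +8 skips
    // the two back-to-back s_trap instructions of the emulation hack, while +4
    // skips only the single save-side s_trap.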
1021
1022    s_mov_b32       m0,         s_restore_m0
1023    s_mov_b32       exec_lo,    s_restore_exec_lo
1024    s_mov_b32       exec_hi,    s_restore_exec_hi
1025
1026    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
1027    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
1028    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
1029    s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
1030    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
1031    //s_setreg_b32  hwreg(HW_REG_TRAPSTS),  s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
1032    s_setreg_b32    hwreg(HW_REG_MODE),     s_restore_mode
1033    //reuse s_restore_m0 as a temp register
1034    s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
1035    s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
1036    s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
1037    s_mov_b32       s_restore_tmp, 0x0                                                                              //IB_STS is zero
1038    s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
1039    s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
1040    s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
1041    s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
1042    s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
1043    s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
1044    s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
1045    s_setreg_b32    hwreg(HW_REG_IB_STS),   s_restore_tmp
1046
1047    s_and_b64    exec, exec, exec  // Restore STATUS.EXECZ, not writable by s_setreg_b32
1048    s_and_b64    vcc, vcc, vcc  // Restore STATUS.VCCZ, not writable by s_setreg_b32
1049    s_setreg_b32    hwreg(HW_REG_STATUS),   s_restore_status     // SCC is included; it was changed by the preceding SALU instructions
1050
1051    s_barrier                                                   //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
1052
1053if G8SR_DEBUG_TIMESTAMP
1054    s_memrealtime s_g8sr_ts_restore_d
1055    s_waitcnt lgkmcnt(0)
1056end
1057
1058//  s_rfe_b64 s_restore_pc_lo                                   //Return to the main shader program and resume execution
1059    s_rfe_restore_b64  s_restore_pc_lo, s_restore_m0            // s_restore_m0[0] is used to set STATUS.inst_atc
1060
1061
1062/**************************************************************************/
1063/*                      the END                                           */
1064/**************************************************************************/
1065L_END_PGM:
1066    s_endpgm
1067
1068end
1069
1070
1071/**************************************************************************/
1072/*                      the helper functions                              */
1073/**************************************************************************/
1074
1075//Only for saving hwregs to memory
1076function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
1077        s_mov_b32 exec_lo, m0                   //assuming exec_lo is not needed anymore from this point on
1078        s_mov_b32 m0, s_mem_offset
1079        s_buffer_store_dword s, s_rsrc, m0      glc:1
1080        s_add_u32       s_mem_offset, s_mem_offset, 4
1081        s_mov_b32   m0, exec_lo
1082end
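// Usage sketch (illustrative): each call stores one dword and advances the
// offset by 4, so the twelve hwreg saves in the save routine consume 12*4 = 48
// bytes of the 128-byte HWREG slot reported by get_hwreg_size_bytes(); the
// remainder is padding.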
1083
1084
1085// HWREGs are saved before SGPRs, so all of the HWREG temporaries can still be used.
1086function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
1087
1088        s_buffer_store_dwordx4 s[0], s_rsrc, 0  glc:1
1089        s_buffer_store_dwordx4 s[4], s_rsrc, 16  glc:1
1090        s_buffer_store_dwordx4 s[8], s_rsrc, 32  glc:1
1091        s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
1092        s_add_u32       s_rsrc[0], s_rsrc[0], 4*16
1093        s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0             // +scc
1094end
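// Note (illustrative): the four dwordx4 stores cover 16 SGPRs = 64 bytes, and
// the resource base then advances by 64 so the next call writes the following
// contiguous 64-byte group; this is why the caller backs up rsrc0/rsrc1.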
1095
1096
1097function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
1098    s_buffer_load_dword s, s_rsrc, s_mem_offset     glc:1
1099    s_add_u32       s_mem_offset, s_mem_offset, 4
1100end
1101
1102function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
1103    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset      glc:1
1104    s_sub_u32       s_mem_offset, s_mem_offset, 4*16
1105end
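// Note (illustrative): unlike the save path, the offset walks backward by 64
// bytes per call, matching the restore loop that unwinds from s[n] down to s[0].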
1106
1107
1108
1109function get_lds_size_bytes(s_lds_size_byte)
1110    // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
1111    s_getreg_b32   s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)          // lds_size
1112    s_lshl_b32     s_lds_size_byte, s_lds_size_byte, 8                      //LDS size in bytes = lds_size * 64 DW * 4 bytes/DW    // granularity 64DW
1113end
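// Worked example (illustrative): an LDS_SIZE field of 2 means two 64-DW
// granules, i.e. 2 << 8 = 512 bytes (64 DW * 4 bytes per granule).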
1114
1115function get_vgpr_size_bytes(s_vgpr_size_byte)
1116    s_getreg_b32   s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)  //vgpr_size
1117    s_add_u32      s_vgpr_size_byte, s_vgpr_size_byte, 1
1118    s_lshl_b32     s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 64 threads * 4 bytes   (non-zero value)   //FIXME for GFX, zero is possible
1119end
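// Worked example (illustrative): vgpr_size = 15 means (15+1)*4 = 64 VGPRs, so
// 16 << (2+8) = 16384 bytes (64 VGPRs * 64 threads * 4 bytes).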
1120
1121function get_sgpr_size_bytes(s_sgpr_size_byte)
1122    s_getreg_b32   s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)  //sgpr_size
1123    s_add_u32      s_sgpr_size_byte, s_sgpr_size_byte, 1
1124    s_lshl_b32     s_sgpr_size_byte, s_sgpr_size_byte, 6 //SGPR size in bytes = (sgpr_size + 1) * 16 SGPRs * 4 bytes   (non-zero value)
1125end
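// Worked example (illustrative): sgpr_size = 6 means (6+1)*16 = 112 SGPRs, so
// 7 << 6 = 448 bytes (112 SGPRs * 4 bytes).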
1126
1127function get_hwreg_size_bytes
1128    return 128 //HWREG size 128 bytes
1129end
1130
1131
1132#endif
1133
1134static const uint32_t cwsr_trap_gfx8_hex[] = {
1135	0xbf820001, 0xbf820123,
1136	0xb8f4f802, 0x89748674,
1137	0xb8f5f803, 0x8675ff75,
1138	0x00000400, 0xbf850011,
1139	0xc00a1e37, 0x00000000,
1140	0xbf8c007f, 0x87777978,
1141	0xbf840002, 0xb974f802,
1142	0xbe801d78, 0xb8f5f803,
1143	0x8675ff75, 0x000001ff,
1144	0xbf850002, 0x80708470,
1145	0x82718071, 0x8671ff71,
1146	0x0000ffff, 0xb974f802,
1147	0xbe801f70, 0xb8f5f803,
1148	0x8675ff75, 0x00000100,
1149	0xbf840006, 0xbefa0080,
1150	0xb97a0203, 0x8671ff71,
1151	0x0000ffff, 0x80f08870,
1152	0x82f18071, 0xbefa0080,
1153	0xb97a0283, 0xbef60068,
1154	0xbef70069, 0xb8fa1c07,
1155	0x8e7a9c7a, 0x87717a71,
1156	0xb8fa03c7, 0x8e7a9b7a,
1157	0x87717a71, 0xb8faf807,
1158	0x867aff7a, 0x00007fff,
1159	0xb97af807, 0xbef2007e,
1160	0xbef3007f, 0xbefe0180,
1161	0xbf900004, 0xbf8e0002,
1162	0xbf88fffe, 0xbef8007e,
1163	0x8679ff7f, 0x0000ffff,
1164	0x8779ff79, 0x00040000,
1165	0xbefa0080, 0xbefb00ff,
1166	0x00807fac, 0x867aff7f,
1167	0x08000000, 0x8f7a837a,
1168	0x877b7a7b, 0x867aff7f,
1169	0x70000000, 0x8f7a817a,
1170	0x877b7a7b, 0xbeef007c,
1171	0xbeee0080, 0xb8ee2a05,
1172	0x806e816e, 0x8e6e8a6e,
1173	0xb8fa1605, 0x807a817a,
1174	0x8e7a867a, 0x806e7a6e,
1175	0xbefa0084, 0xbefa00ff,
1176	0x01000000, 0xbefe007c,
1177	0xbefc006e, 0xc0611bfc,
1178	0x0000007c, 0x806e846e,
1179	0xbefc007e, 0xbefe007c,
1180	0xbefc006e, 0xc0611c3c,
1181	0x0000007c, 0x806e846e,
1182	0xbefc007e, 0xbefe007c,
1183	0xbefc006e, 0xc0611c7c,
1184	0x0000007c, 0x806e846e,
1185	0xbefc007e, 0xbefe007c,
1186	0xbefc006e, 0xc0611cbc,
1187	0x0000007c, 0x806e846e,
1188	0xbefc007e, 0xbefe007c,
1189	0xbefc006e, 0xc0611cfc,
1190	0x0000007c, 0x806e846e,
1191	0xbefc007e, 0xbefe007c,
1192	0xbefc006e, 0xc0611d3c,
1193	0x0000007c, 0x806e846e,
1194	0xbefc007e, 0xb8f5f803,
1195	0xbefe007c, 0xbefc006e,
1196	0xc0611d7c, 0x0000007c,
1197	0x806e846e, 0xbefc007e,
1198	0xbefe007c, 0xbefc006e,
1199	0xc0611dbc, 0x0000007c,
1200	0x806e846e, 0xbefc007e,
1201	0xbefe007c, 0xbefc006e,
1202	0xc0611dfc, 0x0000007c,
1203	0x806e846e, 0xbefc007e,
1204	0xb8eff801, 0xbefe007c,
1205	0xbefc006e, 0xc0611bfc,
1206	0x0000007c, 0x806e846e,
1207	0xbefc007e, 0xbefe007c,
1208	0xbefc006e, 0xc0611b3c,
1209	0x0000007c, 0x806e846e,
1210	0xbefc007e, 0xbefe007c,
1211	0xbefc006e, 0xc0611b7c,
1212	0x0000007c, 0x806e846e,
1213	0xbefc007e, 0x867aff7f,
1214	0x04000000, 0xbef30080,
1215	0x8773737a, 0xb8ee2a05,
1216	0x806e816e, 0x8e6e8a6e,
1217	0xb8f51605, 0x80758175,
1218	0x8e758475, 0x8e7a8275,
1219	0xbefa00ff, 0x01000000,
1220	0xbef60178, 0x80786e78,
1221	0x82798079, 0xbefc0080,
1222	0xbe802b00, 0xbe822b02,
1223	0xbe842b04, 0xbe862b06,
1224	0xbe882b08, 0xbe8a2b0a,
1225	0xbe8c2b0c, 0xbe8e2b0e,
1226	0xc06b003c, 0x00000000,
1227	0xc06b013c, 0x00000010,
1228	0xc06b023c, 0x00000020,
1229	0xc06b033c, 0x00000030,
1230	0x8078c078, 0x82798079,
1231	0x807c907c, 0xbf0a757c,
1232	0xbf85ffeb, 0xbef80176,
1233	0xbeee0080, 0xbefe00c1,
1234	0xbeff00c1, 0xbefa00ff,
1235	0x01000000, 0xe0724000,
1236	0x6e1e0000, 0xe0724100,
1237	0x6e1e0100, 0xe0724200,
1238	0x6e1e0200, 0xe0724300,
1239	0x6e1e0300, 0xbefe00c1,
1240	0xbeff00c1, 0xb8f54306,
1241	0x8675c175, 0xbf84002c,
1242	0xbf8a0000, 0x867aff73,
1243	0x04000000, 0xbf840028,
1244	0x8e758675, 0x8e758275,
1245	0xbefa0075, 0xb8ee2a05,
1246	0x806e816e, 0x8e6e8a6e,
1247	0xb8fa1605, 0x807a817a,
1248	0x8e7a867a, 0x806e7a6e,
1249	0x806eff6e, 0x00000080,
1250	0xbefa00ff, 0x01000000,
1251	0xbefc0080, 0xd28c0002,
1252	0x000100c1, 0xd28d0003,
1253	0x000204c1, 0xd1060002,
1254	0x00011103, 0x7e0602ff,
1255	0x00000200, 0xbefc00ff,
1256	0x00010000, 0xbe80007b,
1257	0x867bff7b, 0xff7fffff,
1258	0x877bff7b, 0x00058000,
1259	0xd8ec0000, 0x00000002,
1260	0xbf8c007f, 0xe0765000,
1261	0x6e1e0002, 0x32040702,
1262	0xd0c9006a, 0x0000eb02,
1263	0xbf87fff7, 0xbefb0000,
1264	0xbeee00ff, 0x00000400,
1265	0xbefe00c1, 0xbeff00c1,
1266	0xb8f52a05, 0x80758175,
1267	0x8e758275, 0x8e7a8875,
1268	0xbefa00ff, 0x01000000,
1269	0xbefc0084, 0xbf0a757c,
1270	0xbf840015, 0xbf11017c,
1271	0x8075ff75, 0x00001000,
1272	0x7e000300, 0x7e020301,
1273	0x7e040302, 0x7e060303,
1274	0xe0724000, 0x6e1e0000,
1275	0xe0724100, 0x6e1e0100,
1276	0xe0724200, 0x6e1e0200,
1277	0xe0724300, 0x6e1e0300,
1278	0x807c847c, 0x806eff6e,
1279	0x00000400, 0xbf0a757c,
1280	0xbf85ffef, 0xbf9c0000,
1281	0xbf8200ca, 0xbef8007e,
1282	0x8679ff7f, 0x0000ffff,
1283	0x8779ff79, 0x00040000,
1284	0xbefa0080, 0xbefb00ff,
1285	0x00807fac, 0x8676ff7f,
1286	0x08000000, 0x8f768376,
1287	0x877b767b, 0x8676ff7f,
1288	0x70000000, 0x8f768176,
1289	0x877b767b, 0x8676ff7f,
1290	0x04000000, 0xbf84001e,
1291	0xbefe00c1, 0xbeff00c1,
1292	0xb8f34306, 0x8673c173,
1293	0xbf840019, 0x8e738673,
1294	0x8e738273, 0xbefa0073,
1295	0xb8f22a05, 0x80728172,
1296	0x8e728a72, 0xb8f61605,
1297	0x80768176, 0x8e768676,
1298	0x80727672, 0x8072ff72,
1299	0x00000080, 0xbefa00ff,
1300	0x01000000, 0xbefc0080,
1301	0xe0510000, 0x721e0000,
1302	0xe0510100, 0x721e0000,
1303	0x807cff7c, 0x00000200,
1304	0x8072ff72, 0x00000200,
1305	0xbf0a737c, 0xbf85fff6,
1306	0xbef20080, 0xbefe00c1,
1307	0xbeff00c1, 0xb8f32a05,
1308	0x80738173, 0x8e738273,
1309	0x8e7a8873, 0xbefa00ff,
1310	0x01000000, 0xbef60072,
1311	0x8072ff72, 0x00000400,
1312	0xbefc0084, 0xbf11087c,
1313	0x8073ff73, 0x00008000,
1314	0xe0524000, 0x721e0000,
1315	0xe0524100, 0x721e0100,
1316	0xe0524200, 0x721e0200,
1317	0xe0524300, 0x721e0300,
1318	0xbf8c0f70, 0x7e000300,
1319	0x7e020301, 0x7e040302,
1320	0x7e060303, 0x807c847c,
1321	0x8072ff72, 0x00000400,
1322	0xbf0a737c, 0xbf85ffee,
1323	0xbf9c0000, 0xe0524000,
1324	0x761e0000, 0xe0524100,
1325	0x761e0100, 0xe0524200,
1326	0x761e0200, 0xe0524300,
1327	0x761e0300, 0xb8f22a05,
1328	0x80728172, 0x8e728a72,
1329	0xb8f61605, 0x80768176,
1330	0x8e768676, 0x80727672,
1331	0x80f2c072, 0xb8f31605,
1332	0x80738173, 0x8e738473,
1333	0x8e7a8273, 0xbefa00ff,
1334	0x01000000, 0xbefc0073,
1335	0xc031003c, 0x00000072,
1336	0x80f2c072, 0xbf8c007f,
1337	0x80fc907c, 0xbe802d00,
1338	0xbe822d02, 0xbe842d04,
1339	0xbe862d06, 0xbe882d08,
1340	0xbe8a2d0a, 0xbe8c2d0c,
1341	0xbe8e2d0e, 0xbf06807c,
1342	0xbf84fff1, 0xb8f22a05,
1343	0x80728172, 0x8e728a72,
1344	0xb8f61605, 0x80768176,
1345	0x8e768676, 0x80727672,
1346	0xbefa0084, 0xbefa00ff,
1347	0x01000000, 0xc0211cfc,
1348	0x00000072, 0x80728472,
1349	0xc0211c3c, 0x00000072,
1350	0x80728472, 0xc0211c7c,
1351	0x00000072, 0x80728472,
1352	0xc0211bbc, 0x00000072,
1353	0x80728472, 0xc0211bfc,
1354	0x00000072, 0x80728472,
1355	0xc0211d3c, 0x00000072,
1356	0x80728472, 0xc0211d7c,
1357	0x00000072, 0x80728472,
1358	0xc0211a3c, 0x00000072,
1359	0x80728472, 0xc0211a7c,
1360	0x00000072, 0x80728472,
1361	0xc0211dfc, 0x00000072,
1362	0x80728472, 0xc0211b3c,
1363	0x00000072, 0x80728472,
1364	0xc0211b7c, 0x00000072,
1365	0x80728472, 0xbf8c007f,
1366	0x8671ff71, 0x0000ffff,
1367	0xbefc0073, 0xbefe006e,
1368	0xbeff006f, 0x867375ff,
1369	0x000003ff, 0xb9734803,
1370	0x867375ff, 0xfffff800,
1371	0x8f738b73, 0xb973a2c3,
1372	0xb977f801, 0x8673ff71,
1373	0xf0000000, 0x8f739c73,
1374	0x8e739073, 0xbef60080,
1375	0x87767376, 0x8673ff71,
1376	0x08000000, 0x8f739b73,
1377	0x8e738f73, 0x87767376,
1378	0x8673ff74, 0x00800000,
1379	0x8f739773, 0xb976f807,
1380	0x86fe7e7e, 0x86ea6a6a,
1381	0xb974f802, 0xbf8a0000,
1382	0x95807370, 0xbf810000,
1383};
1384
v5.9
  1/*
  2 * Copyright 2015-2017 Advanced Micro Devices, Inc.
  3 *
  4 * Permission is hereby granted, free of charge, to any person obtaining a
  5 * copy of this software and associated documentation files (the "Software"),
  6 * to deal in the Software without restriction, including without limitation
  7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8 * and/or sell copies of the Software, and to permit persons to whom the
  9 * Software is furnished to do so, subject to the following conditions:
 10 *
 11 * The above copyright notice and this permission notice shall be included in
 12 * all copies or substantial portions of the Software.
 13 *
 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 20 * OTHER DEALINGS IN THE SOFTWARE.
 21 */
 22
 23/* To compile this assembly code:
 24 * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex
 25 */
 26
 27/**************************************************************************/
 28/*                      variables                                         */
 29/**************************************************************************/
 30var SQ_WAVE_STATUS_INST_ATC_SHIFT  = 23
 31var SQ_WAVE_STATUS_INST_ATC_MASK   = 0x00800000
 32var SQ_WAVE_STATUS_SPI_PRIO_SHIFT  = 1
 33var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006
 34var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT   = 0
 35var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE    = 1
 36var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT  = 3
 37var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE   = 29
 38
 39var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT    = 12
 40var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE     = 9
 41var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 8
 42var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE    = 6
 43var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT   = 24
 44var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE    = 3                     //FIXME  sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
 45
 46var SQ_WAVE_TRAPSTS_SAVECTX_MASK    =   0x400
 47var SQ_WAVE_TRAPSTS_EXCE_MASK       =   0x1FF                   // Exception mask
 48var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT   =   10
 49var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK   =   0x100
 50var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT  =   8
 51var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK    =   0x3FF
 52var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT   =   0x0
 53var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE    =   10
 54var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK   =   0xFFFFF800
 55var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT  =   11
 56var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE   =   21
 57
 58var SQ_WAVE_IB_STS_RCNT_SHIFT           =   16                  //FIXME
 59var SQ_WAVE_IB_STS_RCNT_SIZE            =   4                   //FIXME
 60var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT   =   15                  //FIXME
 61var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    =   1                   //FIXME
 62var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG   = 0x00007FFF    //FIXME
 63
 64var SQ_BUF_RSRC_WORD1_ATC_SHIFT     =   24
 65var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT   =   27
 66
 67
 68/*      Save        */
 69var S_SAVE_BUF_RSRC_WORD1_STRIDE        =   0x00040000          //stride is 4 bytes
 70var S_SAVE_BUF_RSRC_WORD3_MISC          =   0x00807FAC          //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
 71
 72var S_SAVE_SPI_INIT_ATC_MASK            =   0x08000000          //bit[27]: ATC bit
 73var S_SAVE_SPI_INIT_ATC_SHIFT           =   27
 74var S_SAVE_SPI_INIT_MTYPE_MASK          =   0x70000000          //bit[30:28]: Mtype
 75var S_SAVE_SPI_INIT_MTYPE_SHIFT         =   28
 76var S_SAVE_SPI_INIT_FIRST_WAVE_MASK     =   0x04000000          //bit[26]: FirstWaveInTG
 77var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT    =   26
 78
 79var S_SAVE_PC_HI_RCNT_SHIFT             =   28                  //FIXME  check with Brian to ensure all fields other than PC[47:0] can be used
 80var S_SAVE_PC_HI_RCNT_MASK              =   0xF0000000          //FIXME
 81var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT     =   27                  //FIXME
 82var S_SAVE_PC_HI_FIRST_REPLAY_MASK      =   0x08000000          //FIXME
 83
 84var s_save_spi_init_lo              =   exec_lo
 85var s_save_spi_init_hi              =   exec_hi
 86
 87                                                //tba_lo and tba_hi need to be saved/restored
 88var s_save_pc_lo            =   ttmp0           //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
 89var s_save_pc_hi            =   ttmp1
 90var s_save_exec_lo          =   ttmp2
 91var s_save_exec_hi          =   ttmp3
 92var s_save_status           =   ttmp4
 93var s_save_trapsts          =   ttmp5           //not really used until the end of the SAVE routine
 94var s_save_xnack_mask_lo    =   ttmp6
 95var s_save_xnack_mask_hi    =   ttmp7
 96var s_save_buf_rsrc0        =   ttmp8
 97var s_save_buf_rsrc1        =   ttmp9
 98var s_save_buf_rsrc2        =   ttmp10
 99var s_save_buf_rsrc3        =   ttmp11
100
101var s_save_mem_offset       =   tma_lo
102var s_save_alloc_size       =   s_save_trapsts          //conflict
103var s_save_tmp              =   s_save_buf_rsrc2        //shared with s_save_buf_rsrc2  (conflict: should not use mem access with s_save_tmp at the same time)
104var s_save_m0               =   tma_hi
105
106/*      Restore     */
107var S_RESTORE_BUF_RSRC_WORD1_STRIDE         =   S_SAVE_BUF_RSRC_WORD1_STRIDE
108var S_RESTORE_BUF_RSRC_WORD3_MISC           =   S_SAVE_BUF_RSRC_WORD3_MISC
109
110var S_RESTORE_SPI_INIT_ATC_MASK             =   0x08000000          //bit[27]: ATC bit
111var S_RESTORE_SPI_INIT_ATC_SHIFT            =   27
112var S_RESTORE_SPI_INIT_MTYPE_MASK           =   0x70000000          //bit[30:28]: Mtype
113var S_RESTORE_SPI_INIT_MTYPE_SHIFT          =   28
114var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK      =   0x04000000          //bit[26]: FirstWaveInTG
115var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT     =   26
116
117var S_RESTORE_PC_HI_RCNT_SHIFT              =   S_SAVE_PC_HI_RCNT_SHIFT
118var S_RESTORE_PC_HI_RCNT_MASK               =   S_SAVE_PC_HI_RCNT_MASK
119var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT      =   S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
120var S_RESTORE_PC_HI_FIRST_REPLAY_MASK       =   S_SAVE_PC_HI_FIRST_REPLAY_MASK
121
122var s_restore_spi_init_lo                   =   exec_lo
123var s_restore_spi_init_hi                   =   exec_hi
124
125var s_restore_mem_offset        =   ttmp2
126var s_restore_alloc_size        =   ttmp3
127var s_restore_tmp               =   ttmp6               //tba_lo/hi need to be restored
128var s_restore_mem_offset_save   =   s_restore_tmp       //no conflict
129
130var s_restore_m0            =   s_restore_alloc_size    //no conflict
131
132var s_restore_mode          =   ttmp7
133
134var s_restore_pc_lo         =   ttmp0
135var s_restore_pc_hi         =   ttmp1
136var s_restore_exec_lo       =   tma_lo                  //no conflict
137var s_restore_exec_hi       =   tma_hi                  //no conflict
138var s_restore_status        =   ttmp4
139var s_restore_trapsts       =   ttmp5
140var s_restore_xnack_mask_lo =   xnack_mask_lo
141var s_restore_xnack_mask_hi =   xnack_mask_hi
142var s_restore_buf_rsrc0     =   ttmp8
143var s_restore_buf_rsrc1     =   ttmp9
144var s_restore_buf_rsrc2     =   ttmp10
145var s_restore_buf_rsrc3     =   ttmp11
146
147/**************************************************************************/
148/*                      trap handler entry points                         */
149/**************************************************************************/
150/* Shader Main*/
151
152shader main
153  asic(VI)
154  type(CS)
155
156
157        s_branch L_SKIP_RESTORE                                     //NOT restore. might be a regular trap or save
158
159L_JUMP_TO_RESTORE:
160    s_branch L_RESTORE                                              //restore
161
162L_SKIP_RESTORE:
163
164    s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)                             //save STATUS since we will change SCC
 165    s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //clear the SPI_PRIO bits
166    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
167    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    //check whether this is for save
168    s_cbranch_scc1  L_SAVE                                      //this is the operation for save
169
170    // *********    Handle non-CWSR traps       *******************
171
172    /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
173    s_load_dwordx4  [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
174    s_waitcnt lgkmcnt(0)
175    s_or_b32        ttmp7, ttmp8, ttmp9
 176    s_cbranch_scc0  L_NO_NEXT_TRAP //next-level trap handler has not been set
177    set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC)
178    s_setpc_b64     [ttmp8,ttmp9] //jump to next level trap handler
179
180L_NO_NEXT_TRAP:
181    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
182    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
183    s_cbranch_scc1  L_EXCP_CASE   // Exception, jump back to the shader program directly.
184    s_add_u32       ttmp0, ttmp0, 4   // S_TRAP case, add 4 to ttmp0
185    s_addc_u32  ttmp1, ttmp1, 0
186L_EXCP_CASE:
187    s_and_b32   ttmp1, ttmp1, 0xFFFF
188    set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC)
189    s_rfe_b64       [ttmp0, ttmp1]
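    // Illustrative note: per the register map above, {ttmp1, ttmp0} holds
    // {pc_rewind/HT/trapID, PC[47:0]}, so masking ttmp1 with 0xFFFF leaves a
    // clean 48-bit return address for s_rfe_b64.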
190
191    // *********        End handling of non-CWSR traps   *******************
192
193/**************************************************************************/
194/*                      save routine                                      */
195/**************************************************************************/
196
197L_SAVE:
198    s_mov_b32       s_save_tmp, 0                                                           //clear saveCtx bit
199    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp     //clear saveCtx bit
200
201    s_mov_b32       s_save_xnack_mask_lo,   xnack_mask_lo                                   //save XNACK_MASK
 202    s_mov_b32       s_save_xnack_mask_hi,   xnack_mask_hi    //XNACK must be saved before any memory operation
203    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)                   //save RCNT
204    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
205    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
206    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)   //save FIRST_REPLAY
207    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
208    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
209    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS)                                        //clear RCNT and FIRST_REPLAY in IB_STS
210    s_and_b32       s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
211
212    s_setreg_b32    hwreg(HW_REG_IB_STS), s_save_tmp
213
214    /*      inform SPI the readiness and wait for SPI's go signal */
215    s_mov_b32       s_save_exec_lo, exec_lo                                                 //save EXEC and use EXEC for the go signal from SPI
216    s_mov_b32       s_save_exec_hi, exec_hi
217    s_mov_b64       exec,   0x0                                                             //clear EXEC to get ready to receive
218
219        s_sendmsg   sendmsg(MSG_SAVEWAVE)  //send SPI a message and wait for SPI's write to EXEC
220
221    // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
222    s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
223    s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
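    // Illustrative arithmetic: (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) = 0x4, which
    // falls inside the SPI_PRIO field (mask 0x6) and raises the wave priority to 2.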
224
225  L_SLEEP:
 226    s_sleep 0x2                // sleep 1 (64 clk) is not enough with 8 waves per SIMD and can hang SQ: the 7th/8th waves cannot win arbitration to execute instructions while the other waves spin in this sleep loop waiting for wrexec != 0
227
228        s_cbranch_execz L_SLEEP
229
 230    /*      setup Resource Constants    */
231    s_mov_b32       s_save_buf_rsrc0,   s_save_spi_init_lo                                                      //base_addr_lo
232    s_and_b32       s_save_buf_rsrc1,   s_save_spi_init_hi, 0x0000FFFF                                          //base_addr_hi
233    s_or_b32        s_save_buf_rsrc1,   s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
 234    s_mov_b32       s_save_buf_rsrc2,   0                                                                       //NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
235    s_mov_b32       s_save_buf_rsrc3,   S_SAVE_BUF_RSRC_WORD3_MISC
236    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
237    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)         //get ATC bit into position
238    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or ATC
239    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
240    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)     //get MTYPE bits into position
241    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or MTYPE
242
243    //FIXME  right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi  (might need to save them before using them?)
244    s_mov_b32       s_save_m0,          m0                                                                  //save M0
245
246    /*      global mem offset           */
247    s_mov_b32       s_save_mem_offset,  0x0                                                                     //mem offset initial value = 0
248
249
250
251
252    /*      save HW registers   */
253    //////////////////////////////
254
255  L_SAVE_HWREG:
256        // HWREG SR memory offset : size(VGPR)+size(SGPR)
257       get_vgpr_size_bytes(s_save_mem_offset)
258       get_sgpr_size_bytes(s_save_tmp)
259       s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
260
261
262    s_mov_b32       s_save_buf_rsrc2, 0x4                               //NUM_RECORDS   in bytes
263        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
264
265
266    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)                  //M0
267    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)                   //PC
268    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
269    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)             //EXEC
270    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
271    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)              //STATUS
272
273    //s_save_trapsts conflicts with s_save_alloc_size
274    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
275    write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)             //TRAPSTS
276
277    write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_LO
278    write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_HI
279
280    //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
281    s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)                                                   //MODE
282    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
283    write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_LO
284    write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_HI
285
286
287
288    /*      the first wave in the threadgroup    */
 289        // save the first_wave bit in tba_hi's unused bit 26
 290    s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK     // extract the first wave bit
291    //s_or_b32        tba_hi, s_save_tmp, tba_hi                                        // save first wave bit to tba_hi.bits[26]
292    s_mov_b32        s_save_exec_hi, 0x0
293    s_or_b32         s_save_exec_hi, s_save_tmp, s_save_exec_hi                          // save first wave bit to s_save_exec_hi.bits[26]
294
295
296    /*          save SGPRs      */
 297        // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
298    //////////////////////////////
299
300    // SGPR SR memory offset : size(VGPR)
301    get_vgpr_size_bytes(s_save_mem_offset)
302    // TODO, change RSRC word to rearrange memory layout for SGPRS
303
 304    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)               //sgpr_size
305    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
306    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 4                         //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
307
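    // Worked example (illustrative): an SGPR_SIZE field of 6 gives
    // (6+1) << 4 = 112 SGPRs, so the save loop below runs 112/16 = 7 times.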
308        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 2                    //NUM_RECORDS in bytes
309        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
310
 311    // back up s_save_buf_rsrc0/1 to s_save_xnack_mask_lo/hi, since the write_16sgpr_to_mem function advances rsrc0
312    //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
313    s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
314    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
315    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
316
317    s_mov_b32       m0, 0x0                         //SGPR initial index value =0
318  L_SAVE_SGPR_LOOP:
319    // SGPR is allocated in 16 SGPR granularity
320    s_movrels_b64   s0, s0     //s0 = s[0+m0], s1 = s[1+m0]
321    s_movrels_b64   s2, s2     //s2 = s[2+m0], s3 = s[3+m0]
322    s_movrels_b64   s4, s4     //s4 = s[4+m0], s5 = s[5+m0]
323    s_movrels_b64   s6, s6     //s6 = s[6+m0], s7 = s[7+m0]
324    s_movrels_b64   s8, s8     //s8 = s[8+m0], s9 = s[9+m0]
325    s_movrels_b64   s10, s10   //s10 = s[10+m0], s11 = s[11+m0]
326    s_movrels_b64   s12, s12   //s12 = s[12+m0], s13 = s[13+m0]
327    s_movrels_b64   s14, s14   //s14 = s[14+m0], s15 = s[15+m0]
328
329    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
330    s_add_u32       m0, m0, 16                                                      //next sgpr index
331    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
332    s_cbranch_scc1  L_SAVE_SGPR_LOOP                                    //SGPR save is complete?
333    // restore s_save_buf_rsrc0,1
334    //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
335    s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
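    // Illustrative note: write_16sgpr_to_mem advanced rsrc0/rsrc1 by 64 bytes per
    // iteration, so the original buffer base is recovered from the backup pair here.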
336
337
338
339
 340    /*          save the first 4 VGPRs, so the LDS save can use them        */
 341        // each wave allocates at least 4 VGPRs...
342    /////////////////////////////////////////////////////////////////////////////////////
343
344    s_mov_b32       s_save_mem_offset, 0
345    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
346    s_mov_b32       exec_hi, 0xFFFFFFFF
347
348        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
349
350    // VGPR Allocated in 4-GPR granularity
351
352        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
353        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
354        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
355        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
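        // Illustrative arithmetic: with ADD_TID_ENABLE and a 4-byte stride, each
        // buffer_store_dword writes 64 lanes * 4 bytes = 256 bytes, hence the
        // offset:256*n spacing between v0..v3.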
356
357
358
359    /*          save LDS        */
360    //////////////////////////////
361
362  L_SAVE_LDS:
363
364        // Change EXEC to all threads...
365    s_mov_b32       exec_lo, 0xFFFFFFFF   //need every thread from now on
366    s_mov_b32       exec_hi, 0xFFFFFFFF
367
368    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)             //lds_size
369    s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF                //lds_size is zero?
 370    s_cbranch_scc0  L_SAVE_LDS_DONE                                                                            //no LDS used? jump to L_SAVE_LDS_DONE
371
372    s_barrier               //LDS is used? wait for other waves in the same TG
373    //s_and_b32     s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
374    s_and_b32       s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
375    s_cbranch_scc0  L_SAVE_LDS_DONE
376
 377        // the first wave does the LDS save;
378
379    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 6                         //LDS size in dwords = lds_size * 64dw
380    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //LDS size in bytes
381    s_mov_b32       s_save_buf_rsrc2,  s_save_alloc_size                            //NUM_RECORDS in bytes
382
383    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
384    //
385    get_vgpr_size_bytes(s_save_mem_offset)
386    get_sgpr_size_bytes(s_save_tmp)
387    s_add_u32  s_save_mem_offset, s_save_mem_offset, s_save_tmp
388    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
389
390
391        s_mov_b32       s_save_buf_rsrc2,  0x1000000                  //NUM_RECORDS in bytes
392    s_mov_b32       m0, 0x0                                               //lds_offset initial value = 0
393
394
395      v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
396      v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2     // tid
397      v_mul_i32_i24 v2, v3, 8   // tid*8
398      v_mov_b32 v3, 256*2
399      s_mov_b32 m0, 0x10000
400      s_mov_b32 s0, s_save_buf_rsrc3
401      s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF    // disable add_tid
402      s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000   //DFMT
403
404L_SAVE_LDS_LOOP_VECTOR:
 405      ds_read_b64 v[0:1], v2    // read 8 bytes of LDS at byte address v2
406      s_waitcnt lgkmcnt(0)
407      buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1  glc:1  slc:1
408//      s_waitcnt vmcnt(0)
409      v_add_u32 v2, vcc[0:1], v2, v3
410      v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
411      s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
412
413      // restore rsrc3
414      s_mov_b32 s_save_buf_rsrc3, s0
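      // Illustrative arithmetic: each lane stores 8 bytes at address tid*8, so one
      // loop iteration covers 64 lanes * 8 bytes = 512 bytes, matching the
      // per-lane address increment v3 = 256*2.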
415
416L_SAVE_LDS_DONE:
417
418
 419    /*          save VGPRs  - save the rest of the VGPRs        */
420    //////////////////////////////////////////////////////////////////////////////////////
421  L_SAVE_VGPR:
422    // VGPR SR memory offset: 0
423    // TODO rearrange the RSRC words to use swizzle for VGPR save...
424
 425    s_mov_b32       s_save_mem_offset, (0+256*4)                                    // offset for the remaining VGPRs (v0..v3 already saved)
426    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
427    s_mov_b32       exec_hi, 0xFFFFFFFF
428
 429    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)                   //vgpr_size
430    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
431    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
432    s_lshl_b32      s_save_buf_rsrc2,  s_save_alloc_size, 8                         //NUM_RECORDS in bytes (64 threads*4)
433        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
434
435    // VGPR store using dw burst
 436    s_mov_b32         m0, 0x4   //VGPR initial index value = 4 (v0..v3 were already saved above)
437    s_cmp_lt_u32      m0, s_save_alloc_size
438    s_cbranch_scc0    L_SAVE_VGPR_END
439
440
441    s_set_gpr_idx_on    m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
442    s_add_u32       s_save_alloc_size, s_save_alloc_size, 0x1000                    //add 0x1000 since we compare m0 against it later
443
444  L_SAVE_VGPR_LOOP:
 445    v_mov_b32       v0, v0              //v0 = v[0+m0]
 446    v_mov_b32       v1, v1              //v1 = v[1+m0]
 447    v_mov_b32       v2, v2              //v2 = v[2+m0]
 448    v_mov_b32       v3, v3              //v3 = v[3+m0]
449
450        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
451        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
452        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
453        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
454
455    s_add_u32       m0, m0, 4                                                       //next vgpr index
456    s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4                     //every buffer_store_dword does 256 bytes
457    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
458    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
459    s_set_gpr_idx_off
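    // Worked example (illustrative): with 64 VGPRs allocated, m0 steps through
    // 4, 8, ..., 60, i.e. 15 iterations, each storing 4 VGPRs * 256 bytes = 1 KiB.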
460
461L_SAVE_VGPR_END:
462    s_branch    L_END_PGM
463
464
465
466/**************************************************************************/
467/*                      restore routine                                   */
468/**************************************************************************/
469
470L_RESTORE:
 471    /*      Setup Resource Constants    */
472    s_mov_b32       s_restore_buf_rsrc0,    s_restore_spi_init_lo                                                           //base_addr_lo
473    s_and_b32       s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF                                               //base_addr_hi
474    s_or_b32        s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
475    s_mov_b32       s_restore_buf_rsrc2,    0                                                                               //NUM_RECORDS initial value = 0 (in bytes)
476    s_mov_b32       s_restore_buf_rsrc3,    S_RESTORE_BUF_RSRC_WORD3_MISC
477    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
478    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)       //get ATC bit into position
479    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or ATC
480    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
481    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)   //get MTYPE bits into position
482    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or MTYPE
483
484    /*      global mem offset           */
485//  s_mov_b32       s_restore_mem_offset, 0x0                               //mem offset initial value = 0
486
487    /*      the first wave in the threadgroup    */
488    s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
489    s_cbranch_scc0  L_RESTORE_VGPR
490
491    /*          restore LDS     */
492    //////////////////////////////
493  L_RESTORE_LDS:
494
495    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE although can be moved ahead
496    s_mov_b32       exec_hi, 0xFFFFFFFF
497
498    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)              //lds_size
499    s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF                  //lds_size is zero?
500    s_cbranch_scc0  L_RESTORE_VGPR                                                          //no lds used? jump to L_RESTORE_VGPR
501    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 6                           //LDS size in dwords = lds_size * 64dw
502    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //LDS size in bytes
503    s_mov_b32       s_restore_buf_rsrc2,    s_restore_alloc_size                            //NUM_RECORDS in bytes
504
505    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
506    //
507    get_vgpr_size_bytes(s_restore_mem_offset)
508    get_sgpr_size_bytes(s_restore_tmp)
509    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
510    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()            //FIXME, Check if offset overflow???
511
512
513        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
514    s_mov_b32       m0, 0x0                                                                 //lds_offset initial value = 0
515
516  L_RESTORE_LDS_LOOP:
517        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1                    // first 64DW
518        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256         // second 64DW
519    s_add_u32       m0, m0, 256*2                                               // 128 DW
520    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*2           //mem offset increased by 128DW
521    s_cmp_lt_u32    m0, s_restore_alloc_size                                    //scc=(m0 < s_restore_alloc_size) ? 1 : 0
522    s_cbranch_scc1  L_RESTORE_LDS_LOOP                                                      //LDS restore is complete?
523
524
525    /*          restore VGPRs       */
526    //////////////////////////////
527  L_RESTORE_VGPR:
528        // VGPR SR memory offset : 0
529    s_mov_b32       s_restore_mem_offset, 0x0
530    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE although can be moved ahead
531    s_mov_b32       exec_hi, 0xFFFFFFFF
532
 533    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)    //vgpr_size
534    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
535    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
536    s_lshl_b32      s_restore_buf_rsrc2,  s_restore_alloc_size, 8                           //NUM_RECORDS in bytes (64 threads*4)
537        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
538
539    // VGPR load using dw burst
 540    s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset     // restore starts with v4; v0..v3 are restored last
541    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
 542    s_mov_b32       m0, 4                               //VGPR initial index value = 4
543    s_set_gpr_idx_on  m0, 0x8                       //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
544    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 0x8000                      //add 0x8000 since we compare m0 against it later
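    // Illustrative note: s_set_gpr_idx_on placed 0x8 into M0[15:12], so m0 carries
    // a constant 0x8000 bias; adding the same bias to s_restore_alloc_size keeps
    // the s_cmp_lt_u32 loop test valid.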
545
546  L_RESTORE_VGPR_LOOP:
547        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
548        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
549        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
550        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
551    s_waitcnt       vmcnt(0)                                                                //ensure data ready
552    v_mov_b32       v0, v0                                                                  //v[0+m0] = v0
553    v_mov_b32       v1, v1
554    v_mov_b32       v2, v2
555    v_mov_b32       v3, v3
556    s_add_u32       m0, m0, 4                                                               //next vgpr index
557    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4                           //every buffer_load_dword does 256 bytes
558    s_cmp_lt_u32    m0, s_restore_alloc_size                                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
559    s_cbranch_scc1  L_RESTORE_VGPR_LOOP                                                     //VGPR restore (except v0) is complete?
560    s_set_gpr_idx_off
561                                                                                            /* VGPR restore on v0 */
562        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1
563        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256
564        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*2
565        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*3
566
567    /*          restore SGPRs       */
568    //////////////////////////////
569
570    // SGPR SR memory offset : size(VGPR)
571    get_vgpr_size_bytes(s_restore_mem_offset)
572    get_sgpr_size_bytes(s_restore_tmp)
573    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
574    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4     // restore SGPR from S[n] to S[0], by 16 sgprs group
575    // TODO, change RSRC word to rearrange memory layout for SGPRS
576
 577    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)                //sgpr_size
578    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
579    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 4                           //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
580
581        s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 2                     //NUM_RECORDS in bytes
582        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
583
 584    /* If 112 SGPRs are allocated, 4 SGPRs are not used: TBA(108,109) and TMA(110,111).
 585       However, it is safe to restore these 4 SGPRs anyway, since TBA/TMA will later be restored by HWREG.
 586    */
587    s_mov_b32 m0, s_restore_alloc_size
588
589 L_RESTORE_SGPR_LOOP:
590    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)  //PV: further performance improvement can be made
591    s_waitcnt       lgkmcnt(0)                                                              //ensure data ready
592
593    s_sub_u32 m0, m0, 16    // Restore from S[n] to S[0]
594
595    s_movreld_b64   s0, s0      //s[0+m0] = s0
596    s_movreld_b64   s2, s2
597    s_movreld_b64   s4, s4
598    s_movreld_b64   s6, s6
599    s_movreld_b64   s8, s8
600    s_movreld_b64   s10, s10
601    s_movreld_b64   s12, s12
602    s_movreld_b64   s14, s14
603
 604    s_cmp_eq_u32    m0, 0               //scc = (m0 == 0) ? 1 : 0
 605    s_cbranch_scc0  L_RESTORE_SGPR_LOOP             //loop until all SGPRs, including s[0..15], are restored
606
607    /*      restore HW registers    */
608    //////////////////////////////
609  L_RESTORE_HWREG:
610
611    // HWREG SR memory offset : size(VGPR)+size(SGPR)
612    get_vgpr_size_bytes(s_restore_mem_offset)
613    get_sgpr_size_bytes(s_restore_tmp)
614    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
615
616
617    s_mov_b32       s_restore_buf_rsrc2, 0x4                                                //NUM_RECORDS   in bytes
618        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
619
620    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)                    //M0
621    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)             //PC
622    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
623    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)               //EXEC
624    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
625    read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)                //STATUS
626    read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)               //TRAPSTS
627    read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_LO
628    read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_HI
629    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)              //MODE
630    read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_LO
631    read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_HI
632
633    s_waitcnt       lgkmcnt(0)                                                                                      //from now on, it is safe to restore STATUS and IB_STS
634
635    s_mov_b32       m0,         s_restore_m0
636    s_mov_b32       exec_lo,    s_restore_exec_lo
637    s_mov_b32       exec_hi,    s_restore_exec_hi
638
    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
    s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
    s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
    //s_setreg_b32  hwreg(HW_REG_TRAPSTS),  s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
    s_setreg_b32    hwreg(HW_REG_MODE),     s_restore_mode
    //reuse s_restore_m0 as a temp register
    s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
    s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
    s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
    s_mov_b32       s_restore_tmp, 0x0                                                      //IB_STS is zero
    s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
    s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
    s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
    s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
    s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
    s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
    s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
    s_setreg_b32    hwreg(HW_REG_IB_STS),   s_restore_tmp
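// The save path stashed the live IB_STS.RCNT and IB_STS.FIRST_REPLAY fields in spare
// high bits of the saved PC_HI; the sequence above moves each field from its
// S_SAVE_PC_HI_* position back to its SQ_WAVE_IB_STS_* position and rewrites IB_STS.
// s_restore_m0 is left holding STATUS.INST_ATC in bit 0 for s_rfe_restore_b64 below.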

    s_and_b32    s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff  //pc[47:32]                 //Do it here in order not to affect STATUS
    s_and_b64    exec, exec, exec                              // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64    vcc, vcc, vcc                                 // Restore STATUS.VCCZ, not writable by s_setreg_b32
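// EXECZ and VCCZ are derived status bits: any SALU write to exec/vcc recomputes them.
// ANDing each register with itself is a no-op on the value but forces the hardware to
// re-evaluate the corresponding Z flag, which s_setreg_b32 cannot set directly.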
    set_status_without_spi_prio(s_restore_status, s_restore_tmp)  // SCC is included in STATUS; it was clobbered by the preceding SALU instructions

    s_barrier                                                   //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
//  s_rfe_b64 s_restore_pc_lo                                   //Return to the main shader program and resume execution
    s_rfe_restore_b64  s_restore_pc_lo, s_restore_m0            // s_restore_m0[0] is used to set STATUS.inst_atc


/**************************************************************************/
/*                      the END                                           */
/**************************************************************************/
L_END_PGM:
    s_endpgm

end


/**************************************************************************/
/*                      the helper functions                              */
/**************************************************************************/

// Only used for saving a HWREG value to memory
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
        s_mov_b32 exec_lo, m0                   //assuming exec_lo is not needed anymore from this point on
        s_mov_b32 m0, s_mem_offset
        s_buffer_store_dword s, s_rsrc, m0      glc:1
        s_add_u32       s_mem_offset, s_mem_offset, 4
        s_mov_b32   m0, exec_lo
end
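// Note: s_buffer_store_dword takes its offset from m0 here, so the caller's m0 is
// parked in exec_lo for the duration of the store (exec has already been saved at
// this point) and put back afterwards; s_mem_offset advances by one dword per call.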


// HWREGs are saved before SGPRs, so all HWREGs may be used freely here.
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)

        s_buffer_store_dwordx4 s[0], s_rsrc, 0  glc:1
        s_buffer_store_dwordx4 s[4], s_rsrc, 16  glc:1
        s_buffer_store_dwordx4 s[8], s_rsrc, 32  glc:1
        s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
        s_add_u32       s_rsrc[0], s_rsrc[0], 4*16
        s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0             // +scc
end
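// Note: instead of bumping s_mem_offset, this routine stores four dwordx4 groups at
// immediate offsets 0/16/32/48 and then advances the buffer base address held in the
// first two dwords of the resource descriptor by 64 bytes (add plus carry via s_addc_u32).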


function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dword s, s_rsrc, s_mem_offset     glc:1
    s_add_u32       s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset      glc:1
    s_sub_u32       s_mem_offset, s_mem_offset, 4*16
end
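// Note: the offset is decremented (not incremented) after each dwordx16 load because
// the SGPR restore loop walks the save area from the highest 16-SGPR group downwards.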


function get_lds_size_bytes(s_lds_size_byte)
    // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
    s_getreg_b32   s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)          // lds_size
    s_lshl_b32     s_lds_size_byte, s_lds_size_byte, 8                      //LDS size in bytes = lds_size * 64 DW * 4 bytes (granularity 64DW)
end
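// Worked example (illustrative): lds_size = 2 gives 2 << 8 = 512 bytes, i.e.
// 2 granules * 64 DW * 4 bytes per DW.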

function get_vgpr_size_bytes(s_vgpr_size_byte)
    s_getreg_b32   s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)  //vgpr_size
    s_add_u32      s_vgpr_size_byte, s_vgpr_size_byte, 1
    s_lshl_b32     s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Save size in bytes = (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (non-zero value)   //FIXME for GFX, zero is possible
end
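// Worked example (illustrative): vgpr_size = 3 gives (3+1) << (2+8) = 4096 bytes,
// i.e. 16 VGPRs * 64 lanes * 4 bytes per VGPR per lane.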

function get_sgpr_size_bytes(s_sgpr_size_byte)
    s_getreg_b32   s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)  //sgpr_size
    s_add_u32      s_sgpr_size_byte, s_sgpr_size_byte, 1
    s_lshl_b32     s_sgpr_size_byte, s_sgpr_size_byte, 6 //Save size in bytes = (sgpr_size + 1) * 16 SGPRs * 4 bytes (non-zero value)
end
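// Worked example (illustrative): sgpr_size = 1 gives (1+1) << 6 = 128 bytes,
// i.e. 32 SGPRs * 4 bytes each.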

function get_hwreg_size_bytes
    return 128 //HWREG save area is 128 bytes
end

function set_status_without_spi_prio(status, tmp)
    // Do not restore STATUS.SPI_PRIO since scheduler may have raised it.
    s_lshr_b32      tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT
    s_setreg_b32    hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp
    s_nop           0x2 // avoid S_SETREG => S_SETREG hazard
    s_setreg_b32    hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status
end
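// Note: STATUS is rewritten as two field groups, the bits above SPI_PRIO and the bits
// below it, so every field except SPI_PRIO is restored. The second write passes the
// raw status value since the PRE_SPI_PRIO field group starts at STATUS bit 0.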