/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * gfx12:
 *   cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx12.asm -P -o gfx12.sp3
 *   sp3 gfx12.sp3 -hex gfx12.hex
 */

#define CHIP_GFX12 37

#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised

var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4
var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000
var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK = 0x8000
var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15
var SQ_WAVE_STATUS_WAVE64_SHIFT = 29
var SQ_WAVE_STATUS_WAVE64_SIZE = 1
var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24
var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9

var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT = 7
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK = 0x100
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT = 8
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200
var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800
var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80
var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK = 0x200

var SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK = SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK |\
    SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK |\
    SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK |\
    SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK |\
    SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK |\
    SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT
var BARRIER_STATE_SIGNAL_OFFSET = 16
var BARRIER_STATE_VALID_OFFSET = 0

var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31

var s_sgpr_save_num = 108
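// Only s0..s107 are saved and restored; s108..s127 are not preserved
// (hence the 20*4 byte adjustment in L_RESTORE_SGPR below).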

var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_state_priv = ttmp12
var s_save_excp_flag_priv = ttmp15
var s_save_xnack_mask = s_save_excp_flag_priv
var s_wave_size = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_excp_flag_priv
var s_save_tmp = ttmp14
var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp
var s_save_ttmps_hi = s_save_excp_flag_priv

var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_WAVE_SIZE = 25
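// Bit position at which get_wave_size2() caches STATUS.WAVE64 inside
// s_wave_size/s_restore_size (1 = wave64, 0 = wave32).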

var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp2
var s_restore_mem_offset_save = s_restore_tmp
var s_restore_m0 = s_restore_alloc_size
var s_restore_mode = ttmp7
var s_restore_flat_scratch = s_restore_tmp
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp4
var s_restore_exec_hi = ttmp5
var s_restore_state_priv = ttmp14
var s_restore_excp_flag_priv = ttmp15
var s_restore_xnack_mask = ttmp13
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_size = ttmp6
var s_restore_ttmps_lo = s_restore_tmp
var s_restore_ttmps_hi = s_restore_alloc_size
var s_restore_spi_init_hi_save = s_restore_exec_hi

shader main
    asic(DEFAULT)
    type(CS)
    wave_size(32)

    s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE
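    // Two fixed entry points: the s_branch at offset 0x0 is the trap/save
    // entry, and L_JUMP_TO_RESTORE above is the restore entry, which the
    // driver is assumed to start waves at when resuming a saved context.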

L_SKIP_RESTORE:
    s_getreg_b32 s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV) //save STATE_PRIV since we will change SCC

    // Clear SPI_PRIO: do not save with elevated priority.
    // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
    s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK

    s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)

    s_and_b32 ttmp2, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK
    s_cbranch_scc0 L_NOT_HALTED

L_HALTED:
    // Host trap may occur while wave is halted.
    s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_SAVE:
    s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
    s_cbranch_scc1 L_SAVE

    // Wave is halted but neither host trap nor SAVECTX is raised.
    // Caused by instruction fetch memory violation.
    // Spin wait until context saved to prevent interrupt storm.
    s_sleep 0x10
    s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
    s_branch L_CHECK_SAVE

L_NOT_HALTED:
    // Let the second-level handler deal with non-SAVECTX exceptions or traps.
    // Any concurrent SAVECTX will be handled upon re-entry once halted.

    // Check non-maskable exceptions. memory_violation, illegal_instruction
    // and xnack_error exceptions always cause the wave to enter the trap
    // handler.
    s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP

    // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
    // Maskable exceptions only cause the wave to enter the trap handler if
    // their respective bit in mode.excp_en is set.
    s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
    s_and_b32 ttmp3, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
    s_cbranch_scc0 L_NOT_ADDR_WATCH
    s_or_b32 ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK

L_NOT_ADDR_WATCH:
    s_getreg_b32 ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL)
    s_and_b32 ttmp2, ttmp3, ttmp2
    s_cbranch_scc1 L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
    // Check trap_id != 0
    s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP

#if SINGLE_STEP_MISSED_WORKAROUND
    // Prioritize single step exception over context save.
    // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
    // WAVE_TRAP_CTRL is already in ttmp3.
    s_and_b32 ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK
    s_cbranch_scc1 L_FETCH_2ND_TRAP
#endif

    s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
    s_cbranch_scc1 L_SAVE

L_FETCH_2ND_TRAP:
    // Read second-level TBA/TMA from first-level TMA and jump if available.
    // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
    // ttmp12 holds SQ_WAVE_STATE_PRIV
    s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
    s_wait_idle
    s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

    s_bitcmp1_b32 ttmp15, 0xF
    s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA
    s_or_b32 ttmp15, ttmp15, 0xFFFF0000
L_NO_SIGN_EXTEND_TMA:
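    // MSG_RTN_GET_TMA returns TMA >> 8, so the shift left by 8 rebuilds the
    // address. Bit 15 of ttmp15 is then address bit 47; the OR above
    // sign-extends it into bits 63:48 (assuming 48-bit canonical addresses).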

    s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS // debug trap enabled flag
    s_wait_idle
    s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
    s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
    s_or_b32 ttmp11, ttmp11, ttmp2

    s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 scope:SCOPE_SYS // second-level TBA
    s_wait_idle
    s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 scope:SCOPE_SYS // second-level TMA
    s_wait_idle

    s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
    s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler has not been set
    s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler

L_NO_NEXT_TRAP:
    // If not caused by trap then halt wave to prevent re-entry.
    s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
    s_cbranch_scc1 L_TRAP_CASE

    // Host trap will not cause trap re-entry.
    s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
    s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
    s_cbranch_scc1 L_EXIT_TRAP
    s_or_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK

    // If the PC points to S_ENDPGM then context save will fail if STATE_PRIV.HALT is set.
    // Rewind the PC to prevent this from occurring.
    s_sub_u32 ttmp0, ttmp0, 0x8
    s_subb_u32 ttmp1, ttmp1, 0x0

    s_branch L_EXIT_TRAP

L_TRAP_CASE:
    // Advance past trap instruction to prevent re-entry.
    s_add_u32 ttmp0, ttmp0, 0x4
    s_addc_u32 ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
    s_and_b32 ttmp1, ttmp1, 0xFFFF

    // Restore SQ_WAVE_STATUS.
    s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32

    // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
    // Only restore fields which the trap handler changes.
    s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
    s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
                 SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv
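    // Field-window write: SCC_SHIFT = 9 and POISON_ERR_SHIFT = 15, so this
    // covers bits 9..15 (7 bits) of STATE_PRIV and leaves BARRIER_COMPLETE
    // and the other low bits untouched.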

    s_rfe_b64 [ttmp0, ttmp1]

L_SAVE:
    // If VGPRs have been deallocated then terminate the wavefront.
    // It has no remaining program to run and cannot save without VGPRs.
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
    s_bitcmp1_b32 s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
    s_cbranch_scc0 L_HAVE_VGPRS
    s_endpgm
L_HAVE_VGPRS:

    s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
    s_mov_b32 s_save_tmp, 0
    s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit

    /* inform SPI of readiness and wait for SPI's go signal */
    s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
    s_mov_b32 s_save_exec_hi, exec_hi
    s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive

    s_sendmsg_rtn_b64 [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
    s_wait_idle

    // Save first_wave flag so we can clear high bits of save address.
    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
    s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
    s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
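    // PC_HI only uses bits [15:0] for pc[47:32], so the FIRST_WAVE flag can
    // be parked in bit 31: bit 26 of spi_init_hi shifted up by (31 - 26) = 5.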

    // Trap temporaries must be saved via VGPR but all VGPRs are in use.
    // There is no ttmp space to hold the resource constant for VGPR save.
    // Save v0 by itself since it requires only two SGPRs.
    s_mov_b32 s_save_ttmps_lo, exec_lo
    s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF
    s_mov_b32 exec_lo, 0xFFFFFFFF
    s_mov_b32 exec_hi, 0xFFFFFFFF
    global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
    v_mov_b32 v0, 0x0
    s_mov_b32 exec_lo, s_save_ttmps_lo
    s_mov_b32 exec_hi, s_save_ttmps_hi
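    // Save-area layout, as implied by the offset arithmetic in this handler:
    //   +0           : VGPRs (4-VGPR granularity; v0..v3 first)
    //   +vgpr_size   : shared VGPRs (wave64 only)
    //   +svgpr_size  : SGPRs (get_sgpr_size_bytes() = 512 bytes)
    //   +512         : HWREGs (get_hwreg_size_bytes() = 128 bytes),
    //                  with the ttmps stored at +0x40 within this block
    //   +128         : LDS (first wave of the workgroup only)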

    // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
    get_wave_size2(s_save_ttmps_hi)
    get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
    get_svgpr_size_bytes(s_save_ttmps_hi)
    s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
    s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
    s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
    s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
    s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0

    v_writelane_b32 v0, ttmp4, 0x4
    v_writelane_b32 v0, ttmp5, 0x5
    v_writelane_b32 v0, ttmp6, 0x6
    v_writelane_b32 v0, ttmp7, 0x7
    v_writelane_b32 v0, ttmp8, 0x8
    v_writelane_b32 v0, ttmp9, 0x9
    v_writelane_b32 v0, ttmp10, 0xA
    v_writelane_b32 v0, ttmp11, 0xB
    v_writelane_b32 v0, ttmp13, 0xD
    v_writelane_b32 v0, exec_lo, 0xE
    v_writelane_b32 v0, exec_hi, 0xF

    s_mov_b32 exec_lo, 0x3FFF
    s_mov_b32 exec_hi, 0x0
    global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] offset:0x40 scope:SCOPE_SYS
    v_readlane_b32 ttmp14, v0, 0xE
    v_readlane_b32 ttmp15, v0, 0xF
    s_mov_b32 exec_lo, ttmp14
    s_mov_b32 exec_hi, ttmp15

    /* setup resource constants */
    s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
    s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes), though not necessarily initialized
    s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

    s_mov_b32 s_save_m0, m0

    /* global mem offset */
    s_mov_b32 s_save_mem_offset, 0x0
    get_wave_size2(s_wave_size)

    /* save first 4 VGPRs, needed for SGPR save */
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
    s_branch L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPRs are allocated in 4-VGPR granularity.

    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3
    s_branch L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPRs are allocated in 4-VGPR granularity.

    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3

    /* save HW registers */

L_SAVE_HWREG:
    // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
    get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
    get_svgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource
    v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource
    v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store
    s_mov_b32 m0, 0x0 //Next lane of v2 to write to

    // Ensure no further changes to barrier or LDS state.
    // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
    s_barrier_signal -2
    s_barrier_wait -2

    // Re-read final state of BARRIER_COMPLETE field for save.
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
    s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
    s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
    s_or_b32 s_save_state_priv, s_save_state_priv, s_save_tmp

    write_hwreg_to_v2(s_save_m0)
    write_hwreg_to_v2(s_save_pc_lo)
    s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
    write_hwreg_to_v2(s_save_tmp)
    write_hwreg_to_v2(s_save_exec_lo)
    write_hwreg_to_v2(s_save_exec_hi)
    write_hwreg_to_v2(s_save_state_priv)

    s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
    write_hwreg_to_v2(s_save_tmp)

    write_hwreg_to_v2(s_save_xnack_mask)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_MODE)
    write_hwreg_to_v2(s_save_m0)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
    write_hwreg_to_v2(s_save_m0)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI)
    write_hwreg_to_v2(s_save_m0)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
    write_hwreg_to_v2(s_save_m0)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL)
    write_hwreg_to_v2(s_save_m0)

    s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
    write_hwreg_to_v2(s_save_tmp)

    s_get_barrier_state s_save_tmp, -1
    s_wait_kmcnt (0)
    write_hwreg_to_v2(s_save_tmp)
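    // v2 lanes 0..14 now hold, in order: m0, pc_lo, pc_hi, exec_lo, exec_hi,
    // state_priv, excp_flag_priv, xnack_mask, mode, scratch_base_lo,
    // scratch_base_hi, excp_flag_user, trap_ctrl, status, barrier_state.
    // L_RESTORE_HWREG reads them back in this order (status is skipped).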

    // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
    s_mov_b32 exec_lo, 0xFFFF
    s_mov_b32 exec_hi, 0x0
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS

    // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
    s_mov_b32 exec_lo, 0xFFFFFFFF

    /* save SGPRs */
    // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...

    // SGPR SR memory offset : size(VGPR)+size(SVGPR)
    get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
    get_svgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into

    s_mov_b32 m0, 0x0 //SGPR initial index value = 0
    s_nop 0x0 //Manually inserted wait states
L_SAVE_SGPR_LOOP:
    // SGPRs are allocated in 16-SGPR granularity.
    s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
    s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
    s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]

    write_16sgpr_to_v2(s0)

    s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes been filled?
    s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE

    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
    s_mov_b32 ttmp13, 0x0
    v_mov_b32 v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:

    s_add_u32 m0, m0, 16 //next sgpr index
    s_cmp_lt_u32 m0, 96 //scc = (m0 < 96) ? 1 : 0
    s_cbranch_scc1 L_SAVE_SGPR_LOOP //repeat until the first 96 SGPRs are saved

    //save the remaining 12 SGPRs
    s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
    write_12sgpr_to_v2(s0)

    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
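    // 108 SGPRs in total: the loop copied six batches of 16 (s0..s95),
    // flushing v2 every 32 lanes, and the tail handled the remaining 12
    // (s96..s107), flushed by the store above.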

    /* save LDS */

L_SAVE_LDS:
    // Change EXEC to all threads...
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
    s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //is lds_size zero?
    s_cbranch_scc0 L_SAVE_LDS_DONE //no LDS used? jump to L_SAVE_LDS_DONE

    s_and_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
    s_cbranch_scc0 L_SAVE_LDS_DONE

    // Only the first wave does the LDS save.

    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
    s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+size(SVGPR)+size(SGPR)+size(HWREG)
    //
    get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
    get_svgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    //load lane offsets 0~63*4 (byte addresses) into vgpr v0
    v_mbcnt_lo_u32_b32 v0, -1, 0
    v_mbcnt_hi_u32_b32 v0, -1, v0
    v_mul_u32_u24 v0, 4, v0
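    // v0 = lane_id * 4: mbcnt over an all-ones mask yields each lane's index,
    // scaled here to a per-lane dword byte offset for the LDS reads below.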

    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_mov_b32 m0, 0x0
    s_cbranch_scc1 L_SAVE_LDS_W64

L_SAVE_LDS_W32:
    s_mov_b32 s3, 128
    s_nop 0
    s_nop 0
    s_nop 0
L_SAVE_LDS_LOOP_W32:
    ds_read_b32 v1, v0
    s_wait_idle
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS

    s_add_u32 m0, m0, s3 //every store covers 128 bytes (32 lanes * 4B)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
    v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //repeat until LDS save is complete

    s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
    s_mov_b32 s3, 256
    s_nop 0
    s_nop 0
    s_nop 0
L_SAVE_LDS_LOOP_W64:
    ds_read_b32 v1, v0
    s_wait_idle
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS

    s_add_u32 m0, m0, s3 //every store covers 256 bytes (64 lanes * 4B)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
    v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //repeat until LDS save is complete

L_SAVE_LDS_DONE:
    /* save the remaining VGPRs */
L_SAVE_VGPR:
    // VGPR SR memory offset: 0
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
    s_mov_b32 s_save_mem_offset, (0+128*4) //skip v0-v3, already saved
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
    s_mov_b32 s_save_mem_offset, (0+256*4) //skip v0-v3, already saved
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
    //determine whether this is wave32 or wave64
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_SAVE_VGPR_WAVE64

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPRs are allocated in 4-VGPR granularity.

    // VGPR store using dword burst
    s_mov_b32 m0, 0x4 //VGPR initial index value = 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc0 L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
    v_movrels_b32 v0, v0 //v0 = v[0+m0]
    v_movrels_b32 v1, v1 //v1 = v[1+m0]
    v_movrels_b32 v2, v2 //v2 = v[2+m0]
    v_movrels_b32 v3, v3 //v3 = v[3+m0]

    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3

    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //repeat until VGPR save is complete

    s_branch L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR store using dword burst
    s_mov_b32 m0, 0x4 //VGPR initial index value = 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc0 L_SAVE_SHARED_VGPR

L_SAVE_VGPR_W64_LOOP:
    v_movrels_b32 v0, v0 //v0 = v[0+m0]
    v_movrels_b32 v1, v1 //v1 = v[1+m0]
    v_movrels_b32 v2, v2 //v2 = v[2+m0]
    v_movrels_b32 v3, v3 //v3 = v[3+m0]

    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3

    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //repeat until VGPR save is complete

L_SAVE_SHARED_VGPR:
    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
    s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //is shared_vgpr_size zero?
    s_cbranch_scc0 L_SAVE_VGPR_END //no shared VGPRs used? jump to L_SAVE_VGPR_END
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
    //m0 now has the normal vgpr count; add the shared_vgpr count to get the total count.
    //saving shared_vgprs will start from the index in m0
    s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
    s_mov_b32 exec_lo, 0xFFFFFFFF
    s_mov_b32 exec_hi, 0x00000000
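    // Shared VGPRs exist only in wave64 and are 32 lanes wide, so they are
    // stored with only exec_lo set; each occupies 32 lanes * 4 bytes = 128
    // bytes, matching the offset increment in the loop below.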

L_SAVE_SHARED_VGPR_WAVE64_LOOP:
    v_movrels_b32 v0, v0 //v0 = v[0+m0]
    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
    s_add_u32 m0, m0, 1 //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //repeat until SHARED_VGPR save is complete

L_SAVE_VGPR_END:
    s_branch L_END_PGM

L_RESTORE:
    /* setup resource constants */
    s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
    s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
    s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC

    // Save s_restore_spi_init_hi for later use.
    s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi

    //determine whether this is wave32 or wave64
    get_wave_size2(s_restore_size)

    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0 L_RESTORE_VGPR

    /* restore LDS */
L_RESTORE_LDS:
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
    s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //is lds_size zero?
    s_cbranch_scc0 L_RESTORE_VGPR //no LDS used? jump to L_RESTORE_VGPR
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
    s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+size(SVGPR)+size(SGPR)+size(HWREG)
    //
    get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
    get_svgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_mov_b32 m0, 0x0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
    s_wait_idle
    ds_store_addtid_b32 v0
    s_add_u32 m0, m0, 128 //every pass restores 128 bytes (32 lanes * 4B)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //repeat until LDS restore is complete
    s_branch L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
    s_wait_idle
    ds_store_addtid_b32 v0
    s_add_u32 m0, m0, 256 //every pass restores 256 bytes (64 lanes * 4B)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //repeat until LDS restore is complete

    /* restore VGPRs */
L_RESTORE_VGPR:
    // VGPR SR memory offset : 0
    s_mov_b32 s_restore_mem_offset, 0x0
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
    //determine whether this is wave32 or wave64
    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_RESTORE_VGPR_WAVE64

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR load using dword burst
    s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset //restore starts with v4; v0-v3 are restored last
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
    s_mov_b32 m0, 4 //VGPR initial index value = 4
    s_cmp_lt_u32 m0, s_restore_alloc_size
    s_cbranch_scc0 L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE32_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*3
    s_wait_idle
    v_movreld_b32 v0, v0 //v[0+m0] = v0
    v_movreld_b32 v1, v1
    v_movreld_b32 v2, v2
    v_movreld_b32 v3, v3
    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //repeat until VGPR restore (except v0-v3) is complete

    /* VGPR restore of v0-v3 */
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*3
    s_wait_idle

    s_branch L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR load using dword burst
    s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset //restore starts with v4; v0-v3 are restored last
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_mov_b32 m0, 4 //VGPR initial index value = 4
    s_cmp_lt_u32 m0, s_restore_alloc_size
    s_cbranch_scc0 L_RESTORE_SHARED_VGPR

L_RESTORE_VGPR_WAVE64_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*3
    s_wait_idle
    v_movreld_b32 v0, v0 //v[0+m0] = v0
    v_movreld_b32 v1, v1
    v_movreld_b32 v2, v2
    v_movreld_b32 v3, v3
    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //repeat until VGPR restore (except v0-v3) is complete

L_RESTORE_SHARED_VGPR:
    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
    s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //is shared_vgpr_size zero?
    s_cbranch_scc0 L_RESTORE_V0 //no shared VGPRs used?
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
    //m0 now has the normal vgpr count; add the shared_vgpr count to get the total count.
    //restoring shared_vgprs will start from the index in m0
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
    s_mov_b32 exec_lo, 0xFFFFFFFF
    s_mov_b32 exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
    s_wait_idle
    v_movreld_b32 v0, v0 //v[0+m0] = v0
    s_add_u32 m0, m0, 1 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //repeat until SHARED_VGPR restore is complete

    s_mov_b32 exec_hi, 0xFFFFFFFF //restore exec_hi before restoring v0-v3

    /* VGPR restore of v0-v3 */
L_RESTORE_V0:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*3
    s_wait_idle

    /* restore SGPRs */
    //restored as 4+8+16*6 = 108 SGPRs, in reverse order
    // SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
    get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
    get_svgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 are not saved

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    s_mov_b32 m0, s_sgpr_save_num
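    // SGPRs are restored top-down: m0 starts at 108, then one batch of 4
    // (s104..s107), one of 8 (s96..s103), and six of 16 down to s0; each
    // read_*sgpr_from_mem() call steps the memory offset backwards first.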

    read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle

    s_sub_u32 m0, m0, 4 //m0 = 104: restore s[104:107]
    s_nop 0 //hazard: SALU M0 => S_MOVREL

    s_movreld_b64 s0, s0 //s[0+m0] = s0
    s_movreld_b64 s2, s2

    read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle

    s_sub_u32 m0, m0, 8 //m0 = 96: restore s[96:103]
    s_nop 0 //hazard: SALU M0 => S_MOVREL

    s_movreld_b64 s0, s0 //s[0+m0] = s0
    s_movreld_b64 s2, s2
    s_movreld_b64 s4, s4
    s_movreld_b64 s6, s6

L_RESTORE_SGPR_LOOP:
    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle

    s_sub_u32 m0, m0, 16 //step down by 16: restore s[m0:m0+15]
    s_nop 0 //hazard: SALU M0 => S_MOVREL

    s_movreld_b64 s0, s0 //s[0+m0] = s0
    s_movreld_b64 s2, s2
    s_movreld_b64 s4, s4
    s_movreld_b64 s6, s6
    s_movreld_b64 s8, s8
    s_movreld_b64 s10, s10
    s_movreld_b64 s12, s12
    s_movreld_b64 s14, s14

    s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
    s_cbranch_scc0 L_RESTORE_SGPR_LOOP

    // s_barrier with STATE_PRIV.TRAP_AFTER_INST=1, STATUS.PRIV=1 incorrectly asserts debug exception.
    // Clear DEBUG_EN before and restore MODE after the barrier.
    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE), 0

    /* restore HW registers */
L_RESTORE_HWREG:
    // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
    get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
    get_svgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // Restore s_restore_spi_init_hi before the saved value gets clobbered.
    s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save

    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_state_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_excp_flag_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle

    s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch

    read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle

    s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch

    read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle
    s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp

    read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle
    s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp

    // Only the first wave needs to restore the workgroup barrier.
    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0 L_SKIP_BARRIER_RESTORE

    // Skip over WAVE_STATUS, since there is no state to restore from it
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4

    read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_wait_idle

    s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET
    s_cbranch_scc0 L_SKIP_BARRIER_RESTORE

    // extract the saved signal count from s_restore_tmp
    s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET

    // We need to call s_barrier_signal repeatedly to restore the signal
    // count of the work group barrier. The member count is already
    // initialized with the number of waves in the work group.
L_BARRIER_RESTORE_LOOP:
    s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp
    s_cbranch_scc0 L_SKIP_BARRIER_RESTORE
    s_barrier_signal -1
    s_add_i32 s_restore_tmp, s_restore_tmp, -1
    s_branch L_BARRIER_RESTORE_LOOP

L_SKIP_BARRIER_RESTORE:

    s_mov_b32 m0, s_restore_m0
    s_mov_b32 exec_lo, s_restore_exec_lo
    s_mov_b32 exec_hi, s_restore_exec_hi

    // EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed.
    // Only restore the other fields to avoid clobbering them.
    s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv
    s_lshr_b32 s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
    s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE), s_restore_excp_flag_priv
    s_lshr_b32 s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT
    s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE), s_restore_excp_flag_priv
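    // The three write windows skip the bits that must not be clobbered:
    // part 1 = bits 0..4 (below SAVE_CONTEXT at bit 5), part 2 = bit 6 only
    // (ILLEGAL_INST, below HOST_TRAP at bit 7), part 3 = bits 8..31.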

    s_setreg_b32 hwreg(HW_REG_WAVE_MODE), s_restore_mode

    // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
    get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
    get_svgpr_size_bytes(s_restore_ttmps_hi)
    s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
    s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
    s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
    s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
    s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
    s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 scope:SCOPE_SYS
    s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 scope:SCOPE_SYS
    s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 scope:SCOPE_SYS
    s_wait_idle

    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32]; do it here so STATUS is not affected
    s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32

    s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included; it was changed by the preceding SALU ops

    // Make barrier and LDS state visible to all waves in the group.
    // STATE_PRIV.BARRIER_COMPLETE may change after this point.
    s_barrier_signal -2
    s_barrier_wait -2

    s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution

L_END_PGM:
    // Make sure that no wave of the workgroup can exit the trap handler
    // before the workgroup barrier state is saved.
    s_barrier_signal -2
    s_barrier_wait -2
    s_endpgm_saved
end

function write_hwreg_to_v2(s)
    // Copy into VGPR for later TCP store.
    v_writelane_b32 v2, s, m0
    s_add_u32 m0, m0, 0x1
end

function write_16sgpr_to_v2(s)
    // Copy into VGPR for later TCP store.
    for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
        v_writelane_b32 v2, s[sgpr_idx], ttmp13
        s_add_u32 ttmp13, ttmp13, 0x1
    end
end

function write_12sgpr_to_v2(s)
    // Copy into VGPR for later TCP store.
    for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
        v_writelane_b32 v2, s[sgpr_idx], ttmp13
        s_add_u32 ttmp13, ttmp13, 0x1
    end
end

function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dword s, s_rsrc, s_mem_offset scope:SCOPE_SYS
    s_add_u32 s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_sub_u32 s_mem_offset, s_mem_offset, 4*16
    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_sub_u32 s_mem_offset, s_mem_offset, 4*8
    s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_sub_u32 s_mem_offset, s_mem_offset, 4*4
    s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
end

function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
    s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
    s_bitcmp1_b32 s_size, S_WAVE_SIZE
    s_cbranch_scc1 L_ENABLE_SHIFT_W64
    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //bytes = (vgpr_size + 1) * 4 VGPRs * 32 lanes * 4 bytes (non-zero value)
    s_branch L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //bytes = (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (non-zero value)
L_SHIFT_DONE:
end
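// Worked example (illustrative): vgpr_size = 15 in wave32 gives
// (15 + 1) * 4 = 64 VGPRs, and 64 VGPRs * 32 lanes * 4 bytes
// = 16 << (2+7) = 8192 bytes.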

function get_svgpr_size_bytes(s_svgpr_size_byte)
    s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
    s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
end
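// shared_vgpr_size * 8 VGPRs * 32 lanes * 4 bytes = size << (3+7),
// i.e. 1024 bytes per unit of shared_vgpr_size.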

function get_sgpr_size_bytes
    return 512
end

function get_hwreg_size_bytes
    return 128
end

function get_wave_size2(s_reg)
    s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE)
    s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE
end
1130end