// drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm (linux.git)
// GFX10 compute-wave context save/restore (CWSR) trap handler.
1 /*
2  * Copyright 2018 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22
23
24 shader main
25
26 asic(DEFAULT)
27
28 type(CS)
29
30 wave_size(32)
31 /*************************************************************************/
32 /*                                      control on how to run the shader                                         */
33 /*************************************************************************/
34 //any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run)
35 var EMU_RUN_HACK                                        =       0
36 var EMU_RUN_HACK_RESTORE_NORMAL         =       0
37 var EMU_RUN_HACK_SAVE_NORMAL_EXIT       =       0
38 var     EMU_RUN_HACK_SAVE_SINGLE_WAVE   =       0
39 var EMU_RUN_HACK_SAVE_FIRST_TIME        =       0                                       //for interrupted restore in which the first save is through EMU_RUN_HACK
40 var SAVE_LDS                                            =       0
41 var WG_BASE_ADDR_LO                                     =   0x9000a000
42 var WG_BASE_ADDR_HI                                     =       0x0
43 var WAVE_SPACE                                          =       0x9000                          //memory size that each wave occupies in workgroup state mem, increase from 5000 to 9000 for more SGPR need to be saved
44 var CTX_SAVE_CONTROL                            =       0x0
45 var CTX_RESTORE_CONTROL                         =       CTX_SAVE_CONTROL
46 var SIM_RUN_HACK                                        =       0                                       //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
47 var     SGPR_SAVE_USE_SQC                               =       0                                       //use SQC D$ to do the write
48 var USE_MTBUF_INSTEAD_OF_MUBUF          =       0                                       //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
49 var SWIZZLE_EN                                          =       0                                       //whether we use swizzled buffer addressing
50 var SAVE_RESTORE_HWID_DDID          =   0
51 var RESTORE_DDID_IN_SGPR18          =   0
52 /**************************************************************************/
53 /*                      variables                                                                     */
54 /**************************************************************************/
55 var SQ_WAVE_STATUS_INST_ATC_SHIFT  = 23
56 var SQ_WAVE_STATUS_INST_ATC_MASK   = 0x00800000
57 var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006
58
59 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT    = 12
60 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE             = 9
61 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 8
62 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE    = 6
63 var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT   = 24
64 var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE    = 4                                             //FIXME  sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
65 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT    = 24
66 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE     = 4
67 var SQ_WAVE_IB_STS2_WAVE64_SHIFT        = 11
68 var SQ_WAVE_IB_STS2_WAVE64_SIZE         = 1
69
70 var     SQ_WAVE_TRAPSTS_SAVECTX_MASK    =       0x400
71 var SQ_WAVE_TRAPSTS_EXCE_MASK       =   0x1FF                           // Exception mask
72 var     SQ_WAVE_TRAPSTS_SAVECTX_SHIFT   =       10                                      
73 var     SQ_WAVE_TRAPSTS_MEM_VIOL_MASK   =       0x100                                   
74 var     SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT  =       8               
75 var     SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK        =       0x3FF
76 var     SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT       =       0x0
77 var     SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE        =       10
78 var     SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK       =       0xFFFFF800      
79 var     SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT      =       11
80 var     SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE       =       21      
81
82 var SQ_WAVE_IB_STS_RCNT_SHIFT                   =       16                                      //FIXME
83 var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT   =       15                                      //FIXME
84 var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    =   1                   //FIXME
85 var SQ_WAVE_IB_STS_RCNT_SIZE            =   6                   //FIXME
86 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG   = 0x00007FFF    //FIXME
87  
88 var     SQ_BUF_RSRC_WORD1_ATC_SHIFT             =       24
89 var     SQ_BUF_RSRC_WORD3_MTYPE_SHIFT   =       27
90
91
92 /*      Save        */
93 var     S_SAVE_BUF_RSRC_WORD1_STRIDE            =       0x00040000              //stride is 4 bytes 
94 var     S_SAVE_BUF_RSRC_WORD3_MISC                      =       0x00807FAC                      //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE                        
95
96 var     S_SAVE_SPI_INIT_ATC_MASK                        =       0x08000000                      //bit[27]: ATC bit
97 var     S_SAVE_SPI_INIT_ATC_SHIFT                       =       27
98 var     S_SAVE_SPI_INIT_MTYPE_MASK                      =       0x70000000                      //bit[30:28]: Mtype
99 var     S_SAVE_SPI_INIT_MTYPE_SHIFT                     =       28
100 var     S_SAVE_SPI_INIT_FIRST_WAVE_MASK         =       0x04000000                      //bit[26]: FirstWaveInTG
101 var     S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT        =       26
102
103 var S_SAVE_PC_HI_RCNT_SHIFT                             =       28                                      //FIXME  check with Brian to ensure all fields other than PC[47:0] can be used
104 var S_SAVE_PC_HI_RCNT_MASK                              =   0xF0000000                  //FIXME
105 var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT             =       27                                      //FIXME
106 var S_SAVE_PC_HI_FIRST_REPLAY_MASK              =       0x08000000                      //FIXME
107
108 var     s_save_spi_init_lo                              =       exec_lo
109 var s_save_spi_init_hi                          =       exec_hi
110
111 var     s_save_pc_lo                    =       ttmp0                   //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
112 var     s_save_pc_hi                    =       ttmp1                   
113 var s_save_exec_lo                      =       ttmp2
114 var s_save_exec_hi                      =       ttmp3                   
115 var     s_save_status                   =       ttmp4                   
116 var     s_save_trapsts                  =       ttmp5                   //not really used until the end of the SAVE routine
117 var s_wave_size                 =       ttmp6           //ttmp6 is not needed now, since it's only 32bit xnack mask, now use it to determine wave32 or wave64 in EMU_HACK
118 var s_save_xnack_mask       =   ttmp7
119 var     s_save_buf_rsrc0                =       ttmp8
120 var     s_save_buf_rsrc1                =       ttmp9
121 var     s_save_buf_rsrc2                =       ttmp10
122 var     s_save_buf_rsrc3                =       ttmp11
123
124 var s_save_mem_offset           =       ttmp14
125 var s_sgpr_save_num         =   106                     //in gfx10, all sgpr must be saved
126 var s_save_alloc_size           =       s_save_trapsts                  //conflict
127 var s_save_tmp              =   s_save_buf_rsrc2        //shared with s_save_buf_rsrc2  (conflict: should not use mem access with s_save_tmp at the same time)
128 var s_save_m0                           =       ttmp15                                  
129
130 /*      Restore     */
131 var     S_RESTORE_BUF_RSRC_WORD1_STRIDE                 =       S_SAVE_BUF_RSRC_WORD1_STRIDE 
132 var     S_RESTORE_BUF_RSRC_WORD3_MISC                   =       S_SAVE_BUF_RSRC_WORD3_MISC               
133
134 var     S_RESTORE_SPI_INIT_ATC_MASK                         =   0x08000000                      //bit[27]: ATC bit
135 var     S_RESTORE_SPI_INIT_ATC_SHIFT                    =       27
136 var     S_RESTORE_SPI_INIT_MTYPE_MASK                   =       0x70000000                      //bit[30:28]: Mtype
137 var     S_RESTORE_SPI_INIT_MTYPE_SHIFT                  =       28
138 var     S_RESTORE_SPI_INIT_FIRST_WAVE_MASK              =       0x04000000                      //bit[26]: FirstWaveInTG
139 var     S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT         =   26
140
141 var S_RESTORE_PC_HI_RCNT_SHIFT                          =       S_SAVE_PC_HI_RCNT_SHIFT
142 var S_RESTORE_PC_HI_RCNT_MASK                           =   S_SAVE_PC_HI_RCNT_MASK
143 var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT          =       S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
144 var S_RESTORE_PC_HI_FIRST_REPLAY_MASK           =       S_SAVE_PC_HI_FIRST_REPLAY_MASK
145
146 var s_restore_spi_init_lo                   =   exec_lo
147 var s_restore_spi_init_hi                   =   exec_hi
148
149 var s_restore_mem_offset                =       ttmp12
150 var s_restore_alloc_size                =       ttmp3
151 var s_restore_tmp               =   ttmp6
152 var s_restore_mem_offset_save   =       s_restore_tmp           //no conflict
153
154 var s_restore_m0                        =       s_restore_alloc_size    //no conflict                   
155
156 var s_restore_mode                      =       ttmp13
157 var s_restore_hwid1         =  ttmp2
158 var s_restore_ddid          =  s_restore_hwid1
159 var     s_restore_pc_lo             =   ttmp0                   
160 var     s_restore_pc_hi             =   ttmp1
161 var s_restore_exec_lo           =       ttmp14
162 var s_restore_exec_hi           =       ttmp15
163 var     s_restore_status            =   ttmp4                   
164 var     s_restore_trapsts           =   ttmp5
165 //var s_restore_xnack_mask_lo   =       xnack_mask_lo
166 //var s_restore_xnack_mask_hi   =       xnack_mask_hi
167 var s_restore_xnack_mask    =   ttmp7
168 var     s_restore_buf_rsrc0             =       ttmp8
169 var     s_restore_buf_rsrc1             =       ttmp9
170 var     s_restore_buf_rsrc2             =       ttmp10
171 var     s_restore_buf_rsrc3             =       ttmp11
172 var s_restore_size              =       ttmp13                  //ttmp13 has no conflict
173
174 /**************************************************************************/
175 /*                      trap handler entry points                                     */
176 /**************************************************************************/
177     if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))                                       //hack to use trap_id for determining save/restore
178                 //FIXME VCCZ un-init assertion s_getreg_b32     s_save_status, hwreg(HW_REG_STATUS)                     //save STATUS since we will change SCC
179                 s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000                          //change SCC
180         s_cmp_eq_u32 s_save_tmp, 0x007e0000                                             //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.  
181         s_cbranch_scc0 L_JUMP_TO_RESTORE                                                        //do not need to recover STATUS here  since we are going to RESTORE
182                 //FIXME  s_setreg_b32   hwreg(HW_REG_STATUS),   s_save_status           //need to recover STATUS since we are going to SAVE     
183                 s_branch L_SKIP_RESTORE                                                                         //NOT restore, SAVE actually
184         else    
185                 s_branch L_SKIP_RESTORE                                                                         //NOT restore. might be a regular trap or save
186     end
187
188 L_JUMP_TO_RESTORE:
189     s_branch L_RESTORE                                                                                          //restore
190
191 L_SKIP_RESTORE:
192         
193         s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)                                                             //save STATUS since we will change SCC
194     s_andn2_b32         s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //check whether this is for save
195         s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)                                                   
196         s_and_b32               s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    //check whether this is for save  
197         s_cbranch_scc1  L_SAVE                                                                                                                  //this is the operation for save
198
199     // *********    Handle non-CWSR traps       *******************
200     if (!EMU_RUN_HACK)
201                 s_getreg_b32     s_save_trapsts, hwreg(HW_REG_TRAPSTS)
202                 s_and_b32        s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
203                 s_cbranch_scc1  L_EXCP_CASE   // Exception, jump back to the shader program directly.
204                 s_add_u32    ttmp0, ttmp0, 4   // S_TRAP case, add 4 to ttmp0 
205                 
206                 L_EXCP_CASE:
207                 s_and_b32    ttmp1, ttmp1, 0xFFFF
208                 s_rfe_b64    [ttmp0, ttmp1]
209         end
210     // *********        End handling of non-CWSR traps   *******************
211
212 /**************************************************************************/
213 /*                      save routine                                                          */
214 /**************************************************************************/
215
216 L_SAVE: 
217         
218         //check whether there is mem_viol
219         s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)                                                   
220         s_and_b32               s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK                   
221         s_cbranch_scc0  L_NO_PC_REWIND
222     
223         //if so, need rewind PC assuming GDS operation gets NACKed
224         s_mov_b32       s_save_tmp, 0                                                                                                                   //clear mem_viol bit
225         s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp    //clear mem_viol bit 
226         s_and_b32               s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
227         s_sub_u32               s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
228         s_subb_u32              s_save_pc_hi, s_save_pc_hi, 0x0                   // -scc
229
230 L_NO_PC_REWIND:
231     s_mov_b32       s_save_tmp, 0                                                                                                                       //clear saveCtx bit
232         s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp             //clear saveCtx bit   
233
234         //s_mov_b32             s_save_xnack_mask_lo,   xnack_mask_lo                                                                   //save XNACK_MASK  
235         //s_mov_b32             s_save_xnack_mask_hi,   xnack_mask_hi
236     s_getreg_b32        s_save_xnack_mask,  hwreg(HW_REG_SHADER_XNACK_MASK)  
237         s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)                                   //save RCNT
238         s_lshl_b32              s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
239         s_or_b32                s_save_pc_hi, s_save_pc_hi, s_save_tmp
240         s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)   //save FIRST_REPLAY
241         s_lshl_b32              s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
242         s_or_b32                s_save_pc_hi, s_save_pc_hi, s_save_tmp
243         s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS)                                                                                //clear RCNT and FIRST_REPLAY in IB_STS
244         s_and_b32               s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
245
246         s_setreg_b32    hwreg(HW_REG_IB_STS), s_save_tmp
247     
248         /*              inform SPI the readiness and wait for SPI's go signal */
249         s_mov_b32               s_save_exec_lo, exec_lo                                                                                                 //save EXEC and use EXEC for the go signal from SPI
250         s_mov_b32               s_save_exec_hi, exec_hi
251         s_mov_b64               exec,   0x0                                                                                                                             //clear EXEC to get ready to receive
252         if (EMU_RUN_HACK)
253         
254         else
255                 s_sendmsg       sendmsg(MSG_SAVEWAVE)                                                                                                   //send SPI a message and wait for SPI's write to EXEC  
256         end
257
258   L_SLEEP:              
259         s_sleep 0x2
260         
261         if (EMU_RUN_HACK)
262                                                                                                                                                                                         
263         else
264                 s_cbranch_execz L_SLEEP                                                         
265         end
266
267
268         /*      setup Resource Contants    */
269         if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) 
270                 //calculate wd_addr using absolute thread id 
271                 v_readlane_b32 s_save_tmp, v9, 0
272         //determine it is wave32 or wave64
273         s_getreg_b32    s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
274         s_cmp_eq_u32    s_wave_size, 0
275         s_cbranch_scc1  L_SAVE_WAVE32
276         s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64
277         s_branch    L_SAVE_CON
278     L_SAVE_WAVE32:
279         s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32
280     L_SAVE_CON:
281                 s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
282                 s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
283                 s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
284                 s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL              
285         else
286         end
287         if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
288                 s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
289                 s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
290                 s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL              
291         else
292         end
293         
294         
295         s_mov_b32               s_save_buf_rsrc0,       s_save_spi_init_lo                                                                                                              //base_addr_lo
296         s_and_b32               s_save_buf_rsrc1,       s_save_spi_init_hi, 0x0000FFFF                                                                                  //base_addr_hi
297         s_or_b32                s_save_buf_rsrc1,       s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
298     s_mov_b32       s_save_buf_rsrc2,   0                                                                                               //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
299         s_mov_b32               s_save_buf_rsrc3,       S_SAVE_BUF_RSRC_WORD3_MISC
300         s_and_b32               s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK                
301         s_lshr_b32              s_save_tmp,             s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)                     //get ATC bit into position
302         s_or_b32                s_save_buf_rsrc3,       s_save_buf_rsrc3,  s_save_tmp                                                                                   //or ATC
303         s_and_b32               s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK              
304         s_lshr_b32              s_save_tmp,             s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)         //get MTYPE bits into position
305         s_or_b32                s_save_buf_rsrc3,       s_save_buf_rsrc3,  s_save_tmp                                                                                   //or MTYPE      
306         
307         s_mov_b32               s_save_m0,                      m0                                                                                                                                      //save M0
308         
309         /*              global mem offset                       */
310         s_mov_b32               s_save_mem_offset,      0x0                                                                                                                                             //mem offset initial value = 0
311     s_getreg_b32        s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size
312     s_or_b32        s_wave_size, s_save_spi_init_hi,    s_wave_size                                             //share s_wave_size with exec_hi
313
314     /*          save VGPRs          */
315         //////////////////////////////
316   L_SAVE_VGPR:
317   
318         s_mov_b32               exec_lo, 0xFFFFFFFF                                                                                     //need every thread from now on
319     s_and_b32       m0, s_wave_size, 1
320     s_cmp_eq_u32    m0, 1  
321     s_cbranch_scc1  L_ENABLE_SAVE_VGPR_EXEC_HI   
322     s_mov_b32           exec_hi, 0x00000000
323     s_branch        L_SAVE_VGPR_NORMAL
324   L_ENABLE_SAVE_VGPR_EXEC_HI:
325         s_mov_b32               exec_hi, 0xFFFFFFFF
326   L_SAVE_VGPR_NORMAL:   
327         s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)                                   //vpgr_size
328         //for wave32 and wave64, the num of vgpr function is the same?
329     s_add_u32           s_save_alloc_size, s_save_alloc_size, 1
330         s_lshl_b32              s_save_alloc_size, s_save_alloc_size, 2                                                 //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
331     //determine it is wave32 or wave64
332     s_and_b32       m0, s_wave_size, 1
333     s_cmp_eq_u32    m0, 1
334     s_cbranch_scc1  L_SAVE_VGPR_WAVE64
335
336     //zhenxu added it for save vgpr for wave32
337         s_lshl_b32              s_save_buf_rsrc2,  s_save_alloc_size, 7                                                 //NUM_RECORDS in bytes (32 threads*4)
338         if (SWIZZLE_EN)
339                 s_add_u32               s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                                         //FIXME need to use swizzle to enable bounds checking?
340         else
341                 s_mov_b32               s_save_buf_rsrc2,  0x1000000                                                            //NUM_RECORDS in bytes
342         end
343         
344     s_mov_b32           m0, 0x0                                                                                                                 //VGPR initial index value =0
345         //s_set_gpr_idx_on  m0, 0x1                                                                                                             //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
346     //s_add_u32         s_save_alloc_size, s_save_alloc_size, 0x1000                                    //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
347
348   L_SAVE_VGPR_WAVE32_LOOP:                                                                              
349         v_movrels_b32           v0, v0                                                                                                                  //v0 = v[0+m0]  
350             
351     if(USE_MTBUF_INSTEAD_OF_MUBUF)       
352                 tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
353     else
354                 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
355         end
356
357     s_add_u32           m0, m0, 1                                                                                                               //next vgpr index
358         s_add_u32               s_save_mem_offset, s_save_mem_offset, 128                                               //every buffer_store_dword does 128 bytes
359         s_cmp_lt_u32    m0,     s_save_alloc_size                                                                                       //scc = (m0 < s_save_alloc_size) ? 1 : 0
360         s_cbranch_scc1  L_SAVE_VGPR_WAVE32_LOOP                                                                                         //VGPR save is complete?
361     s_branch    L_SAVE_LDS
362     //save vgpr for wave32 ends
363
364   L_SAVE_VGPR_WAVE64:
365         s_lshl_b32              s_save_buf_rsrc2,  s_save_alloc_size, 8                                                 //NUM_RECORDS in bytes (64 threads*4)
366         if (SWIZZLE_EN)
367                 s_add_u32               s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                                         //FIXME need to use swizzle to enable bounds checking?
368         else
369                 s_mov_b32               s_save_buf_rsrc2,  0x1000000                                                            //NUM_RECORDS in bytes
370         end
371         
372     s_mov_b32           m0, 0x0                                                                                                                 //VGPR initial index value =0
373         //s_set_gpr_idx_on  m0, 0x1                                                                                                             //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
374     //s_add_u32         s_save_alloc_size, s_save_alloc_size, 0x1000                                    //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
375
376   L_SAVE_VGPR_WAVE64_LOOP:                                                                              
377         v_movrels_b32           v0, v0                                                                                                                  //v0 = v[0+m0]  
378             
379     if(USE_MTBUF_INSTEAD_OF_MUBUF)       
380                 tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
381     else
382                 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
383         end
384
385     s_add_u32           m0, m0, 1                                                                                                               //next vgpr index
386         s_add_u32               s_save_mem_offset, s_save_mem_offset, 256                                               //every buffer_store_dword does 256 bytes
387         s_cmp_lt_u32    m0,     s_save_alloc_size                                                                                       //scc = (m0 < s_save_alloc_size) ? 1 : 0
388         s_cbranch_scc1  L_SAVE_VGPR_WAVE64_LOOP                                                                                         //VGPR save is complete?
389         //s_set_gpr_idx_off
390     //
391     //Below part will be the save shared vgpr part (new for gfx10)
392     s_getreg_b32        s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)                     //shared_vgpr_size
393     s_and_b32           s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF                                //shared_vgpr_size is zero?
394     s_cbranch_scc0      L_SAVE_LDS                                                                                                          //no shared_vgpr used? jump to L_SAVE_LDS
395     s_lshl_b32          s_save_alloc_size, s_save_alloc_size, 3                                                 //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
396     //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
397     //save shared_vgpr will start from the index of m0
398     s_add_u32       s_save_alloc_size, s_save_alloc_size, m0
399     s_mov_b32           exec_lo, 0xFFFFFFFF
400     s_mov_b32           exec_hi, 0x00000000
401     L_SAVE_SHARED_VGPR_WAVE64_LOOP:                                                                             
402         v_movrels_b32           v0, v0                                                                                                                  //v0 = v[0+m0]  
403         buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
404     s_add_u32           m0, m0, 1                                                                                                               //next vgpr index
405         s_add_u32               s_save_mem_offset, s_save_mem_offset, 128                                               //every buffer_store_dword does 256 bytes
406         s_cmp_lt_u32    m0,     s_save_alloc_size                                                                                       //scc = (m0 < s_save_alloc_size) ? 1 : 0
407         s_cbranch_scc1  L_SAVE_SHARED_VGPR_WAVE64_LOOP                                                                  //SHARED_VGPR save is complete?
408     
409         /*              save LDS            */
410         //////////////////////////////
411   L_SAVE_LDS:
412
413     //Only the first wave in the threadgroup needs to save LDS; all waves meet at the barrier first
414         /*      the first wave in the threadgroup    */
415         s_barrier                                                                                                                                               //FIXME  not performance-optimal "LDS is used? wait for other waves in the same TG" 
416         s_and_b32               s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                                                                //exec is still used here
417         s_cbranch_scc0  L_SAVE_SGPR                                                                                                             //not the first wave? skip LDS save
418         
419         s_mov_b32               exec_lo, 0xFFFFFFFF                                                                                     //need every thread from now on
420     s_and_b32       m0, s_wave_size, 1                                                                                              //bit 0 of s_wave_size: 1 = wave64, 0 = wave32
421     s_cmp_eq_u32    m0, 1
422     s_cbranch_scc1  L_ENABLE_SAVE_LDS_EXEC_HI   
423     s_mov_b32           exec_hi, 0x00000000
424     s_branch        L_SAVE_LDS_NORMAL
425   L_ENABLE_SAVE_LDS_EXEC_HI:
426         s_mov_b32               exec_hi, 0xFFFFFFFF
427   L_SAVE_LDS_NORMAL:    
428         s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)                     //lds_size
429         s_and_b32               s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF                                //lds_size is zero? (AND sets SCC)
430         s_cbranch_scc0  L_SAVE_SGPR                                                                                                             //no lds used? jump to L_SAVE_SGPR
431         s_lshl_b32              s_save_alloc_size, s_save_alloc_size, 6                                                 //LDS size in dwords = lds_size * 64dw
432         s_lshl_b32              s_save_alloc_size, s_save_alloc_size, 2                                                 //LDS size in bytes
433         s_mov_b32               s_save_buf_rsrc2,  s_save_alloc_size                                                    //NUM_RECORDS in bytes
434         if (SWIZZLE_EN)
435                 s_add_u32               s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                                         //FIXME need to use swizzle to enable bounds checking?
436         else
437                 s_mov_b32               s_save_buf_rsrc2,  0x1000000                                                            //NUM_RECORDS in bytes
438         end
439
440     //compute each lane's byte offset (lane_id * 4) into v0 for the LDS read/store below
441     v_mbcnt_lo_u32_b32 v0, -1, 0
442     v_mbcnt_hi_u32_b32 v0, -1, v0                                                                                   //v0 = lane index within the wave
443     v_mul_u32_u24 v0, 4, v0
444
445     s_and_b32       m0, s_wave_size, 1                                                                                              //bit 0 of s_wave_size: 1 = wave64
446     s_cmp_eq_u32    m0, 1
447     s_mov_b32           m0, 0x0                                                                                                             //m0 = LDS byte offset; s_mov does not clobber SCC
448     s_cbranch_scc1  L_SAVE_LDS_LOOP_W64
449
450   L_SAVE_LDS_LOOP_W32:                                                                  
451         if (SAVE_LDS)
452     ds_read_b32 v1, v0
453     s_waitcnt 0                                                                                                             //ensure data ready
454     buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
455         //buffer_store_lds_dword        s_save_buf_rsrc0, s_save_mem_offset lds:1               //save lds to memory doesn't exist in 10
456         end
457         s_add_u32               m0, m0, 128                                                                                                                     //every iteration covers 128 bytes (32 lanes * 4 bytes)
458         s_add_u32               s_save_mem_offset, s_save_mem_offset, 128                                                       //mem offset increased by 128 bytes
459     v_add_nc_u32    v0, v0, 128
460         s_cmp_lt_u32    m0, s_save_alloc_size                                                                                           //scc=(m0 < s_save_alloc_size) ? 1 : 0
461         s_cbranch_scc1  L_SAVE_LDS_LOOP_W32                                                                                                     //LDS save is complete?
462     s_branch        L_SAVE_SGPR
463
464   L_SAVE_LDS_LOOP_W64:                                                                  
465         if (SAVE_LDS)
466     ds_read_b32 v1, v0
467     s_waitcnt 0                                                                                                             //ensure data ready
468     buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
469         //buffer_store_lds_dword        s_save_buf_rsrc0, s_save_mem_offset lds:1               //save lds to memory doesn't exist in 10
470         end
471         s_add_u32               m0, m0, 256                                                                                                                     //every iteration covers 256 bytes (64 lanes * 4 bytes)
472         s_add_u32               s_save_mem_offset, s_save_mem_offset, 256                                                       //mem offset increased by 256 bytes
473     v_add_nc_u32    v0, v0, 256
474         s_cmp_lt_u32    m0, s_save_alloc_size                                                                                           //scc=(m0 < s_save_alloc_size) ? 1 : 0
475         s_cbranch_scc1  L_SAVE_LDS_LOOP_W64                                                                                                     //LDS save is complete?
476    
477         
478         /*              save SGPRs          */
479         //////////////////////////////
480         //s_getreg_b32  s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)                           //spgr_size
481         //s_add_u32             s_save_alloc_size, s_save_alloc_size, 1
482         //s_lshl_b32            s_save_alloc_size, s_save_alloc_size, 4                                                 //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value) 
483         //s_lshl_b32            s_save_alloc_size, s_save_alloc_size, 3                                                 //In gfx10, Number of SGPRs = (sgpr_size + 1) * 8   (non-zero value) 
484   L_SAVE_SGPR:
485     //determine whether this is a wave32 or wave64 wave (bit 0 of s_wave_size)
486     s_and_b32       m0, s_wave_size, 1
487     s_cmp_eq_u32    m0, 1
488     s_cbranch_scc1  L_SAVE_SGPR_VMEM_WAVE64
489     if (SGPR_SAVE_USE_SQC)
490                 s_lshl_b32              s_save_buf_rsrc2,       s_sgpr_save_num, 2                                      //NUM_RECORDS in bytes
491     else
492         s_lshl_b32              s_save_buf_rsrc2,       s_sgpr_save_num, 7                                      //NUM_RECORDS in bytes (32 threads)
493     end
494     s_branch    L_SAVE_SGPR_CONT    
495   L_SAVE_SGPR_VMEM_WAVE64:
496         if (SGPR_SAVE_USE_SQC)
497                 s_lshl_b32              s_save_buf_rsrc2,       s_sgpr_save_num, 2                                      //NUM_RECORDS in bytes 
498         else
499                 s_lshl_b32              s_save_buf_rsrc2,       s_sgpr_save_num, 8                                      //NUM_RECORDS in bytes (64 threads)
500         end
501   L_SAVE_SGPR_CONT:
502         if (SWIZZLE_EN)
503                 s_add_u32               s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                                         //FIXME need to use swizzle to enable bounds checking?
504         else
505                 s_mov_b32               s_save_buf_rsrc2,  0x1000000                                                            //NUM_RECORDS in bytes
506         end
507         
508         //s_mov_b32             m0, 0x0                                                                                                                 //SGPR initial index value =0           
509     //s_nop           0x0                                                             //Manually inserted wait states
510         
    // The wave32/wave64 test result is held in SCC across the next two
    // instructions: s_mov_b32 and s_nop do not modify SCC.
511     s_and_b32       m0, s_wave_size, 1
512     s_cmp_eq_u32    m0, 1
513     
514     s_mov_b32           m0, 0x0                                                                                                                 //SGPR initial index value =0           
515     s_nop           0x0                                                             //Manually inserted wait states
516
517     s_cbranch_scc1  L_SAVE_SGPR_LOOP_WAVE64
518
519   L_SAVE_SGPR_LOOP_WAVE32:                                                                              
520         s_movrels_b32   s0, s0                                                                                                                  //s0 = s[0+m0]
521     //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change    
522         write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                                                        //PV: the best performance should be using s_buffer_store_dwordx4
523         s_add_u32               m0, m0, 1                                                                                                               //next sgpr index
524         s_cmp_lt_u32    m0, s_sgpr_save_num                                                                                     //scc = (m0 < s_sgpr_save_num) ? 1 : 0
525         s_cbranch_scc1  L_SAVE_SGPR_LOOP_WAVE32                                                                                         //SGPR save is complete?
526     s_branch    L_SAVE_HWREG
527
528   L_SAVE_SGPR_LOOP_WAVE64:                                                                              
529         s_movrels_b32   s0, s0                                                                                                                  //s0 = s[0+m0]
530     //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change    
531         write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                                                        //PV: the best performance should be using s_buffer_store_dwordx4
532         s_add_u32               m0, m0, 1                                                                                                               //next sgpr index
533         s_cmp_lt_u32    m0, s_sgpr_save_num                                                                                     //scc = (m0 < s_sgpr_save_num) ? 1 : 0
534         s_cbranch_scc1  L_SAVE_SGPR_LOOP_WAVE64                                                                                         //SGPR save is complete?
535
536         
537         /*              save HW registers       */
538         //////////////////////////////
539   L_SAVE_HWREG:
540     s_mov_b32           s_save_buf_rsrc2, 0x4                                                           //NUM_RECORDS   in bytes
541         if (SWIZZLE_EN)
542                 s_add_u32               s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                                         //FIXME need to use swizzle to enable bounds checking?
543         else
544                 s_mov_b32               s_save_buf_rsrc2,  0x1000000                                                            //NUM_RECORDS in bytes
545         end
546
547     s_and_b32       m0, s_wave_size, 1                                                                                              //bit 0 of s_wave_size: 1 = wave64
548     s_cmp_eq_u32    m0, 1
549     s_cbranch_scc1  L_SAVE_HWREG_WAVE64
550         
551         write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                                 //M0
552
553         if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))      
554                 s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
555                 s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0                      //carry bit over
556         end
557
558         write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                                      //PC
559         write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
560         write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                            //EXEC
561         write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
562         write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                             //STATUS 
563         
564         //s_save_trapsts conflicts with s_save_alloc_size
565         s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
566         write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                            //TRAPSTS
567         
568         //write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                    //XNACK_MASK_LO
569         write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                 //XNACK_MASK
570         
571         //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
572         s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)                                                                                                                                                                           //MODE
573         write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
574     if(SAVE_RESTORE_HWID_DDID)
575     s_getreg_b32        s_save_m0, hwreg(HW_REG_HW_ID1)                                                                                                                                                                         //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
576     write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
577     end
578     s_branch   L_S_PGM_END_SAVED
579
580   L_SAVE_HWREG_WAVE64:
581     write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                                     //M0
582
583         if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))      
584                 s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
585                 s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0                      //carry bit over
586         end
587
588         write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                                      //PC
589         write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
590         write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                            //EXEC
591         write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
592         write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                             //STATUS 
593         
594         //s_save_trapsts conflicts with s_save_alloc_size
595         s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
596         write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                            //TRAPSTS
597         
598         //write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                    //XNACK_MASK_LO
599         write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)                 //XNACK_MASK
600         
601         //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
602         s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)                                                                                                                                                                           //MODE
603         write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
604
605
    // The whole DDID save below is compiled only when SAVE_RESTORE_HWID_DDID
    // is set (the matching 'end' closes it after L_SAVE_DDID_WAVE64).
606     if(SAVE_RESTORE_HWID_DDID)
607     s_getreg_b32        s_save_m0, hwreg(HW_REG_HW_ID1)                                                                                                                                                                         //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
608     write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
609
610         /*              save DDID       */
611         //////////////////////////////
612   L_SAVE_DDID:
613     //EXEC has been saved, no vector inst following
614     s_mov_b32   exec_lo, 0x80000000    //Set MSB to 1. Cleared when draw index is returned
615     s_sendmsg sendmsg(MSG_GET_DDID)
616
617   L_WAIT_DDID_LOOP:    
618     s_nop               7                       // sleep a bit
619     s_bitcmp0_b32 exec_lo, 31   // test to see if MSB is cleared, meaning done
620     s_cbranch_scc0      L_WAIT_DDID_LOOP
621
622     s_mov_b32   s_save_m0, exec_lo
623
624
625     s_mov_b32           s_save_buf_rsrc2, 0x4                                                           //NUM_RECORDS   in bytes
626         if (SWIZZLE_EN)
627                 s_add_u32               s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                                         //FIXME need to use swizzle to enable bounds checking?
628         else
629                 s_mov_b32               s_save_buf_rsrc2,  0x1000000                                                            //NUM_RECORDS in bytes
630         end
631     s_and_b32       m0, s_wave_size, 1
632     s_cmp_eq_u32    m0, 1
633     s_cbranch_scc1  L_SAVE_DDID_WAVE64
634
    // NOTE(review): the wave32 write below falls straight through into
    // L_SAVE_DDID_WAVE64 (there is no s_branch), which would write DDID twice.
    // This appears unreachable in practice since this code sits on the
    // L_SAVE_HWREG_WAVE64 path -- confirm intent before enabling this option.
635     write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) 
636
637   L_SAVE_DDID_WAVE64:
638     write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) 
639
640     end
641    
642   L_S_PGM_END_SAVED:
643         /*     S_PGM_END_SAVED  */                                                      //FIXME  graphics ONLY
    // Under the EMU hack the handler returns directly to the interrupted
    // program (pc advanced past the trapping instruction); otherwise it
    // falls through to the normal end-of-program path.
644         if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) 
645                 s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
646                 s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
647                 s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0                      //carry bit over
648                 s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
649         else
650         end
651
652         
653     s_branch    L_END_PGM
654         
655
656                                 
657 /**************************************************************************/
658 /*                      restore routine                                                       */
659 /**************************************************************************/
660
661 L_RESTORE:
662     /*      Setup Resource Constants    */
663     if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
664                 //calculate wd_addr using absolute thread id
665                 v_readlane_b32 s_restore_tmp, v9, 0
666         //determine it is wave32 or wave64
667         s_getreg_b32    s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13
668         s_cmp_eq_u32    s_restore_size, 0
669         s_cbranch_scc1  L_RESTORE_WAVE32
670         s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //SAVE WAVE64
671         s_branch    L_RESTORE_CON
672     L_RESTORE_WAVE32:
673         s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //SAVE WAVE32
674     L_RESTORE_CON:
675                 s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
676                 s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
677                 s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
678                 s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL     
679         else
680         end
681         
    // Build the buffer resource descriptor (rsrc0..rsrc3) used for all
    // restore loads; ATC and MTYPE bits come from s_restore_spi_init_hi.
682     s_mov_b32           s_restore_buf_rsrc0,    s_restore_spi_init_lo                                                                                                                   //base_addr_lo
683         s_and_b32               s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF                                                                                               //base_addr_hi
684         s_or_b32                s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
685     s_mov_b32       s_restore_buf_rsrc2,        0                                                                                                               //NUM_RECORDS initial value = 0 (in bytes)
686         s_mov_b32               s_restore_buf_rsrc3,    S_RESTORE_BUF_RSRC_WORD3_MISC
687         s_and_b32               s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK              
688         s_lshr_b32              s_restore_tmp,                  s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)               //get ATC bit into position
689         s_or_b32                s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                                                                             //or ATC
690         s_and_b32               s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK            
691         s_lshr_b32              s_restore_tmp,                  s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)   //get MTYPE bits into position
692         s_or_b32                s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                                                                             //or MTYPE
693     //determine it is wave32 or wave64
694     s_getreg_b32        s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
695     s_or_b32        s_restore_size, s_restore_spi_init_hi,    s_restore_size                                             //share s_wave_size with exec_hi
696         
697         /*              global mem offset                       */
698         s_mov_b32               s_restore_mem_offset, 0x0                                                               //mem offset initial value = 0
699
700         /*              restore VGPRs       */
701         //////////////////////////////
702   L_RESTORE_VGPR:
703   
704         s_mov_b32               exec_lo, 0xFFFFFFFF                                                                                                     //need every thread from now on   //be consistent with SAVE although can be moved ahead
705     s_and_b32       m0, s_restore_size, 1                                                                                   //bit 0 of s_restore_size: 1 = wave64
706     s_cmp_eq_u32    m0, 1
707     s_cbranch_scc1  L_ENABLE_RESTORE_VGPR_EXEC_HI   
708     s_mov_b32           exec_hi, 0x00000000
709     s_branch        L_RESTORE_VGPR_NORMAL
710   L_ENABLE_RESTORE_VGPR_EXEC_HI:
711         s_mov_b32               exec_hi, 0xFFFFFFFF
712   L_RESTORE_VGPR_NORMAL:        
713         s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)        //vpgr_size
714         s_add_u32               s_restore_alloc_size, s_restore_alloc_size, 1
715         s_lshl_b32              s_restore_alloc_size, s_restore_alloc_size, 2                                                   //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
716     //determine it is wave32 or wave64
717     s_and_b32       m0, s_restore_size, 1
718     s_cmp_eq_u32    m0, 1
719     s_cbranch_scc1  L_RESTORE_VGPR_WAVE64
720
721     s_lshl_b32          s_restore_buf_rsrc2,  s_restore_alloc_size, 7                                               //NUM_RECORDS in bytes (32 threads*4)
722         if (SWIZZLE_EN)
723                 s_add_u32               s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                                           //FIXME need to use swizzle to enable bounds checking?
724         else
725                 s_mov_b32               s_restore_buf_rsrc2,  0x1000000                                                                         //NUM_RECORDS in bytes
726         end     
727
    // v0 is restored last (it is the scratch register for the loads), so
    // remember its memory offset and start the loop at VGPR index 1.
728         s_mov_b32               s_restore_mem_offset_save, s_restore_mem_offset                                                 // restore start with v1, v0 will be the last
729         s_add_u32               s_restore_mem_offset, s_restore_mem_offset, 128
730     s_mov_b32           m0, 1                                                                                                                                   //VGPR initial index value = 1
731         //s_set_gpr_idx_on  m0, 0x8                                                                                                                             //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
732     //s_add_u32         s_restore_alloc_size, s_restore_alloc_size, 0x8000                                              //add 0x8000 since we compare m0 against it later, might not need this in gfx10 
733
734   L_RESTORE_VGPR_WAVE32_LOOP:                                                                           
735     if(USE_MTBUF_INSTEAD_OF_MUBUF)       
736                 tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
737     else
738                 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset     slc:1 glc:1     
739         end
740         s_waitcnt               vmcnt(0)                                                                                                                                //ensure data ready
741         v_movreld_b32           v0, v0                                                                                                                                  //v[0+m0] = v0
742     s_add_u32           m0, m0, 1                                                                                                                               //next vgpr index
743         s_add_u32               s_restore_mem_offset, s_restore_mem_offset, 128                                                 //every buffer_load_dword does 128 bytes
744         s_cmp_lt_u32    m0,     s_restore_alloc_size                                                                                            //scc = (m0 < s_restore_alloc_size) ? 1 : 0
745         s_cbranch_scc1  L_RESTORE_VGPR_WAVE32_LOOP                                                                                                              //VGPR restore (except v0) is complete?
746         //s_set_gpr_idx_off
747                                                                                                                                                                                         /* VGPR restore on v0 */
748     if(USE_MTBUF_INSTEAD_OF_MUBUF)       
749                 tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
750     else
751                 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save        slc:1 glc:1     
752         end
753
754     s_branch    L_RESTORE_LDS
755
756   L_RESTORE_VGPR_WAVE64:
757     s_lshl_b32          s_restore_buf_rsrc2,  s_restore_alloc_size, 8                                               //NUM_RECORDS in bytes (64 threads*4)
758         if (SWIZZLE_EN)
759                 s_add_u32               s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                                           //FIXME need to use swizzle to enable bounds checking?
760         else
761                 s_mov_b32               s_restore_buf_rsrc2,  0x1000000                                                                         //NUM_RECORDS in bytes
762         end     
763
    // As in the wave32 path, v0 is restored last; remember its offset and
    // start the loop at VGPR index 1.
764         s_mov_b32               s_restore_mem_offset_save, s_restore_mem_offset                                                 // restore start with v1, v0 will be the last
765         s_add_u32               s_restore_mem_offset, s_restore_mem_offset, 256
766     s_mov_b32           m0, 1                                                                                                                                   //VGPR initial index value = 1
767   L_RESTORE_VGPR_WAVE64_LOOP:                                                                           
768     if(USE_MTBUF_INSTEAD_OF_MUBUF)       
769                 tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
770     else
771                 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset     slc:1 glc:1     
772         end
773         s_waitcnt               vmcnt(0)                                                                                                                                //ensure data ready
774         v_movreld_b32           v0, v0                                                                                                                                  //v[0+m0] = v0
775     s_add_u32           m0, m0, 1                                                                                                                               //next vgpr index
776         s_add_u32               s_restore_mem_offset, s_restore_mem_offset, 256                                                 //every buffer_load_dword does 256 bytes
777         s_cmp_lt_u32    m0,     s_restore_alloc_size                                                                                            //scc = (m0 < s_restore_alloc_size) ? 1 : 0
778         s_cbranch_scc1  L_RESTORE_VGPR_WAVE64_LOOP                                                                                                              //VGPR restore (except v0) is complete?
779         //s_set_gpr_idx_off
780     //
781     //Below part will be the restore shared vgpr part (new for gfx10)
782     s_getreg_b32        s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)                  //shared_vgpr_size
783     s_and_b32           s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF                          //shared_vgpr_size is zero? (AND sets SCC)
784     s_cbranch_scc0      L_RESTORE_V0                                                                                                        //no shared_vgpr used? jump to L_RESTORE_V0
785     s_lshl_b32          s_restore_alloc_size, s_restore_alloc_size, 3                                           //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
786     //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
787     //restore shared_vgpr will start from the index of m0
788     s_add_u32       s_restore_alloc_size, s_restore_alloc_size, m0
    // exec_hi is cleared here, presumably because shared VGPRs carry only 32
    // lanes of data -- NOTE(review): confirm against the gfx10 ISA documentation.
789     s_mov_b32           exec_lo, 0xFFFFFFFF
790     s_mov_b32           exec_hi, 0x00000000
791     L_RESTORE_SHARED_VGPR_WAVE64_LOOP: 
792     buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
793     s_waitcnt           vmcnt(0)                                                                                                                                //ensure data ready
794         v_movreld_b32           v0, v0                                                                                                                                  //v[0+m0] = v0
795     s_add_u32           m0, m0, 1                                                                                                                               //next vgpr index
796         s_add_u32               s_restore_mem_offset, s_restore_mem_offset, 128                                                 //mem offset increased by 128 bytes per shared VGPR (32 lanes * 4 bytes)
797         s_cmp_lt_u32    m0,     s_restore_alloc_size                                                                                            //scc = (m0 < s_restore_alloc_size) ? 1 : 0
798         s_cbranch_scc1  L_RESTORE_SHARED_VGPR_WAVE64_LOOP                                                                                                               //loop back while more shared VGPRs remain
799
800     s_mov_b32 exec_hi, 0xFFFFFFFF                                                           //restore back exec_hi before restoring V0!!
801         
802     /* VGPR restore on v0 */
803   L_RESTORE_V0:
804     if(USE_MTBUF_INSTEAD_OF_MUBUF)       
805                 tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
806     else
807                 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save        slc:1 glc:1     
808         end
809
810
811     /*          restore LDS         */
812         //////////////////////////////
813   L_RESTORE_LDS:
814
815     //Only need to check the first wave    
816         /*      the first wave in the threadgroup    */
817         s_and_b32               s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK                       
818         s_cbranch_scc0  L_RESTORE_SGPR
819         
820     s_mov_b32           exec_lo, 0xFFFFFFFF                                                                                                     //need every thread from now on   //be consistent with SAVE although can be moved ahead
821     s_and_b32       m0, s_restore_size, 1
822     s_cmp_eq_u32    m0, 1
823     s_cbranch_scc1  L_ENABLE_RESTORE_LDS_EXEC_HI   
824     s_mov_b32           exec_hi, 0x00000000
825     s_branch        L_RESTORE_LDS_NORMAL
826   L_ENABLE_RESTORE_LDS_EXEC_HI:
827         s_mov_b32               exec_hi, 0xFFFFFFFF
828   L_RESTORE_LDS_NORMAL: 
829         s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)                          //lds_size
830         s_and_b32               s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF                                  //lds_size is zero?
831         s_cbranch_scc0  L_RESTORE_SGPR                                                                                                                  //no lds used? jump to L_RESTORE_VGPR
832         s_lshl_b32              s_restore_alloc_size, s_restore_alloc_size, 6                                                   //LDS size in dwords = lds_size * 64dw
833         s_lshl_b32              s_restore_alloc_size, s_restore_alloc_size, 2                                                   //LDS size in bytes
834         s_mov_b32               s_restore_buf_rsrc2,    s_restore_alloc_size                                                    //NUM_RECORDS in bytes
835         if (SWIZZLE_EN)
836                 s_add_u32               s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                                           //FIXME need to use swizzle to enable bounds checking?
837         else
838                 s_mov_b32               s_restore_buf_rsrc2,  0x1000000                                                                         //NUM_RECORDS in bytes
839         end
840
841     s_and_b32       m0, s_wave_size, 1
842     s_cmp_eq_u32    m0, 1
843     s_mov_b32           m0, 0x0
844     s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64
845
846   L_RESTORE_LDS_LOOP_W32:                                                                       
847         if (SAVE_LDS)
848         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
849     s_waitcnt 0
850         end
851     s_add_u32           m0, m0, 128                                                                                                                             //every buffer_load_dword does 256 bytes
852         s_add_u32               s_restore_mem_offset, s_restore_mem_offset, 128                                         //mem offset increased by 256 bytes
853         s_cmp_lt_u32    m0, s_restore_alloc_size                                                                                                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
854         s_cbranch_scc1  L_RESTORE_LDS_LOOP_W32                                                                                                          //LDS restore is complete?
855     s_branch        L_RESTORE_SGPR
856
857   L_RESTORE_LDS_LOOP_W64:                                                                       
858         if (SAVE_LDS)
859         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
860     s_waitcnt 0
861         end
862     s_add_u32           m0, m0, 256                                                                                                                             //every buffer_load_dword does 256 bytes
863         s_add_u32               s_restore_mem_offset, s_restore_mem_offset, 256                                                 //mem offset increased by 256 bytes
864         s_cmp_lt_u32    m0, s_restore_alloc_size                                                                                                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
865         s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64                                                                                                          //LDS restore is complete?
866
867         
868     /*          restore SGPRs       */
869         //////////////////////////////
870         //s_getreg_b32  s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)                                //spgr_size
871         //s_add_u32             s_restore_alloc_size, s_restore_alloc_size, 1
872         //s_lshl_b32            s_restore_alloc_size, s_restore_alloc_size, 4                                                   //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
873         //s_lshl_b32            s_restore_alloc_size, s_restore_alloc_size, 3                                                   //Number of SGPRs = (sgpr_size + 1) * 8   (non-zero value)
874   L_RESTORE_SGPR:
875     //need to look at it is wave32 or wave64
876     s_and_b32       m0, s_restore_size, 1
877     s_cmp_eq_u32    m0, 1
878     s_cbranch_scc1  L_RESTORE_SGPR_VMEM_WAVE64
879         if (SGPR_SAVE_USE_SQC)
880                 s_lshl_b32              s_restore_buf_rsrc2,    s_sgpr_save_num, 2                                              //NUM_RECORDS in bytes 
881         else
882         s_lshl_b32              s_restore_buf_rsrc2,    s_sgpr_save_num, 7                                              //NUM_RECORDS in bytes (32 threads)
883     end
884     s_branch        L_RESTORE_SGPR_CONT
885   L_RESTORE_SGPR_VMEM_WAVE64:
886     if (SGPR_SAVE_USE_SQC)
887                 s_lshl_b32              s_restore_buf_rsrc2,    s_sgpr_save_num, 2                                              //NUM_RECORDS in bytes 
888         else
889                 s_lshl_b32              s_restore_buf_rsrc2,    s_sgpr_save_num, 8                                              //NUM_RECORDS in bytes (64 threads)
890         end
891
892   L_RESTORE_SGPR_CONT:
893         if (SWIZZLE_EN)
894                 s_add_u32               s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                                           //FIXME need to use swizzle to enable bounds checking?
895         else
896                 s_mov_b32               s_restore_buf_rsrc2,  0x1000000                                                                         //NUM_RECORDS in bytes
897         end
898
899     s_and_b32       m0, s_restore_size, 1
900     s_cmp_eq_u32    m0, 1
901     s_cbranch_scc1  L_RESTORE_SGPR_WAVE64
902
903     read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)              //save s0 to s_restore_tmp
904         s_mov_b32               m0, 0x1
905
906   L_RESTORE_SGPR_LOOP_WAVE32:
907     read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                                                                                                 //PV: further performance improvement can be made
908         s_waitcnt               lgkmcnt(0)                                                                                                                              //ensure data ready
909         s_movreld_b32   s0, s0                                                                  //s[0+m0] = s0
910     s_nop 0                                                                                 // hazard SALU M0=> S_MOVREL
911         s_add_u32               m0, m0, 1                                                                                                                               //next sgpr index
912         s_cmp_lt_u32    m0, s_sgpr_save_num                                                                                             //scc = (m0 < s_restore_alloc_size) ? 1 : 0
913         s_cbranch_scc1  L_RESTORE_SGPR_LOOP_WAVE32                                                                                                              //SGPR restore (except s0) is complete?
914         s_mov_b32               s0, s_restore_tmp                                                                                                                       /* SGPR restore on s0 */
915     s_branch        L_RESTORE_HWREG
916   
917   L_RESTORE_SGPR_WAVE64:
918         read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)          //save s0 to s_restore_tmp
919         s_mov_b32               m0, 0x1                                                                                                                                                         //SGPR initial index value =1   //go on with with s1
920         
921   L_RESTORE_SGPR_LOOP_WAVE64:                                                                                                                                                                   
922         read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                                                                                                     //PV: further performance improvement can be made
923         s_waitcnt               lgkmcnt(0)                                                                                                                              //ensure data ready
924         s_movreld_b32   s0, s0                                                                  //s[0+m0] = s0
925     s_nop 0                                                                                 // hazard SALU M0=> S_MOVREL
926         s_add_u32               m0, m0, 1                                                                                                                               //next sgpr index
927         s_cmp_lt_u32    m0, s_sgpr_save_num                                                                                             //scc = (m0 < s_restore_alloc_size) ? 1 : 0
928         s_cbranch_scc1  L_RESTORE_SGPR_LOOP_WAVE64                                                                                                              //SGPR restore (except s0) is complete?
929         s_mov_b32               s0, s_restore_tmp                                                                                                                       /* SGPR restore on s0 */
930
931         
932     /*          restore HW registers    */
933         //////////////////////////////
934   L_RESTORE_HWREG:
935     s_mov_b32           s_restore_buf_rsrc2, 0x4                                                                                                //NUM_RECORDS   in bytes
936         if (SWIZZLE_EN)
937                 s_add_u32               s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                                           //FIXME need to use swizzle to enable bounds checking?
938         else
939                 s_mov_b32               s_restore_buf_rsrc2,  0x1000000                                                                         //NUM_RECORDS in bytes
940         end
941
942     s_and_b32       m0, s_restore_size, 1
943     s_cmp_eq_u32    m0, 1
944     s_cbranch_scc1  L_RESTORE_HWREG_WAVE64
945
946     read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                       //M0
947         read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                //PC
948         read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
949         read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                              //EXEC
950         read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
951         read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                               //STATUS
952         read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                              //TRAPSTS
953     //read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                    //XNACK_MASK_LO
954         //read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                        //XNACK_MASK_HI
955     read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                       //XNACK_MASK
956         read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                         //MODE
957     if(SAVE_RESTORE_HWID_DDID)
958     read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                            //HW_ID1
959     end
960     s_branch        L_RESTORE_HWREG_FINISH
961
962   L_RESTORE_HWREG_WAVE64:
963         read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                   //M0
964         read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                //PC
965         read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
966         read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                              //EXEC
967         read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
968         read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                               //STATUS
969         read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                              //TRAPSTS
970     //read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                    //XNACK_MASK_LO
971         //read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                        //XNACK_MASK_HI
972     read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                                       //XNACK_MASK
973         read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                         //MODE
974     if(SAVE_RESTORE_HWID_DDID)
975     read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)                            //HW_ID1
976     end
977   L_RESTORE_HWREG_FINISH:
978         s_waitcnt               lgkmcnt(0)                                                                                                                                                                              //from now on, it is safe to restore STATUS and IB_STS
979   
980
981
982     if(SAVE_RESTORE_HWID_DDID)
983   L_RESTORE_DDID:
984     s_mov_b32      m0, s_restore_hwid1                                                      //virture ttrace support: The save-context handler records the SE/SA/WGP/SIMD/wave of the original wave
985     s_ttracedata                                                                            //and then can output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore
986                                     
987     s_mov_b32           s_restore_buf_rsrc2, 0x4                                                                                                //NUM_RECORDS   in bytes
988         if (SWIZZLE_EN)
989                 s_add_u32               s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                                           //FIXME need to use swizzle to enable bounds checking?
990         else
991                 s_mov_b32               s_restore_buf_rsrc2,  0x1000000                                                                         //NUM_RECORDS in bytes
992         end
993
994     s_and_b32       m0, s_restore_size, 1
995     s_cmp_eq_u32    m0, 1
996     s_cbranch_scc1  L_RESTORE_DDID_WAVE64
997
998     read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)     
999     s_branch        L_RESTORE_DDID_FINISH
1000   L_RESTORE_DDID_WAVE64:
1001     read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)     
1002
1003   L_RESTORE_DDID_FINISH:
1004     s_waitcnt           lgkmcnt(0)
1005     //s_mov_b32      m0, s_restore_ddid
1006     //s_ttracedata   
1007     if (RESTORE_DDID_IN_SGPR18)
1008         s_mov_b32   s18, s_restore_ddid
1009         end     
1010     
1011     end   
1012
1013         s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff          //pc[47:32]        //Do it here in order not to affect STATUS
1014
1015         //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
1016         if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
1017                 s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8            //pc[31:0]+8     //two back-to-back s_trap are used (first for save and second for restore)
1018                 s_addc_u32      s_restore_pc_hi, s_restore_pc_hi, 0x0            //carry bit over
1019         end     
1020         if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))          
1021                 s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save is hack through s_trap but restore is normal
1022                 s_addc_u32      s_restore_pc_hi, s_restore_pc_hi, 0x0            //carry bit over
1023         end
1024         
1025         s_mov_b32               m0,             s_restore_m0
1026         s_mov_b32               exec_lo,        s_restore_exec_lo
1027         s_mov_b32               exec_hi,        s_restore_exec_hi
1028         
1029         s_and_b32               s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
1030         s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
1031     s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask         //restore xnack_mask
1032         s_and_b32               s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
1033         s_lshr_b32              s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
1034         s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
1035         //s_setreg_b32  hwreg(HW_REG_TRAPSTS),  s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
1036         s_setreg_b32    hwreg(HW_REG_MODE),     s_restore_mode
1037         //reuse s_restore_m0 as a temp register
1038         s_and_b32               s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
1039         s_lshr_b32              s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
1040         s_lshl_b32              s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
1041         s_mov_b32               s_restore_tmp, 0x0                                                                                                                                                              //IB_STS is zero
1042         s_or_b32                s_restore_tmp, s_restore_tmp, s_restore_m0
1043         s_and_b32               s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
1044         s_lshr_b32              s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
1045         s_lshl_b32              s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
1046         s_or_b32                s_restore_tmp, s_restore_tmp, s_restore_m0
1047     s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK 
1048     s_lshr_b32          s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
1049         s_setreg_b32    hwreg(HW_REG_IB_STS),   s_restore_tmp
1050         s_setreg_b32    hwreg(HW_REG_STATUS),   s_restore_status
1051
1052         s_barrier                                                                                                       //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
1053         
1054         
1055 //      s_rfe_b64 s_restore_pc_lo                                       //Return to the main shader program and resume execution
1056     s_rfe_b64  s_restore_pc_lo            // s_restore_m0[0] is used to set STATUS.inst_atc 
1057
1058
1059 /**************************************************************************/
1060 /*                      the END                                                                       */
1061 /**************************************************************************/    
1062 L_END_PGM:      
1063         s_endpgm
1064         
1065 end     
1066
1067
1068 /**************************************************************************/
1069 /*                      the helper functions                                                      */
1070 /**************************************************************************/
1071 function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
1072         if (use_sqc)
1073                 s_mov_b32 exec_lo, m0                                   //assuming exec_lo is not needed anymore from this point on
1074                 s_mov_b32 m0, s_mem_offset
1075                 s_buffer_store_dword s, s_rsrc, m0              glc:1   
1076                 s_add_u32               s_mem_offset, s_mem_offset, 4
1077                 s_mov_b32       m0, exec_lo
1078     elsif (use_mtbuf)
1079         v_mov_b32       v0,     s
1080         tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
1081                 s_add_u32               s_mem_offset, s_mem_offset, 128
1082     else 
1083         v_mov_b32       v0,     s
1084                 buffer_store_dword      v0, v0, s_rsrc, s_mem_offset    slc:1 glc:1
1085         s_add_u32               s_mem_offset, s_mem_offset, 128
1086         end
1087 end
1088
1089 function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
1090         if (use_sqc)
1091                 s_mov_b32 exec_lo, m0                                   //assuming exec_lo is not needed anymore from this point on
1092                 s_mov_b32 m0, s_mem_offset
1093                 s_buffer_store_dword s, s_rsrc, m0              glc:1   
1094                 s_add_u32               s_mem_offset, s_mem_offset, 4
1095                 s_mov_b32       m0, exec_lo
1096     elsif (use_mtbuf)
1097         v_mov_b32       v0,     s
1098         tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
1099                 s_add_u32               s_mem_offset, s_mem_offset, 256
1100     else 
1101         v_mov_b32       v0,     s
1102                 buffer_store_dword      v0, v0, s_rsrc, s_mem_offset    slc:1 glc:1
1103         s_add_u32               s_mem_offset, s_mem_offset, 256
1104         end
1105 end
1106
1107 function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc)
1108         s_buffer_load_dword s, s_rsrc, s_mem_offset             glc:1
1109         if (use_sqc)
1110                 s_add_u32               s_mem_offset, s_mem_offset, 4
1111         else
1112         s_add_u32               s_mem_offset, s_mem_offset, 128
1113         end
1114 end
1115
1116 function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc)
1117         s_buffer_load_dword s, s_rsrc, s_mem_offset             glc:1
1118         if (use_sqc)
1119                 s_add_u32               s_mem_offset, s_mem_offset, 4
1120         else
1121         s_add_u32               s_mem_offset, s_mem_offset, 256
1122         end
1123 end
1124