/*
 * drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
 * drm/amdgpu: resolve bug in UMC 6 error counter query
 */
1 /*
2  * Copyright 2019 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "umc_v6_1.h"
24 #include "amdgpu_ras.h"
25 #include "amdgpu.h"
26
27 #include "rsmu/rsmu_0_0_2_offset.h"
28 #include "rsmu/rsmu_0_0_2_sh_mask.h"
29 #include "umc/umc_6_1_1_offset.h"
30 #include "umc/umc_6_1_1_sh_mask.h"
31 #include "umc/umc_6_1_2_offset.h"
32
33 #define smnMCA_UMC0_MCUMC_ADDRT0        0x50f10
34
35 #define UMC_6_INST_DIST                 0x40000
36
37 /*
38  * (addr / 256) * 8192, the higher 26 bits in ErrorAddr
39  * is the index of 8KB block
40  */
41 #define ADDR_OF_8KB_BLOCK(addr)                 (((addr) & ~0xffULL) << 5)
42 /* channel index is the index of 256B block */
43 #define ADDR_OF_256B_BLOCK(channel_index)       ((channel_index) << 8)
44 /* offset in 256B block */
45 #define OFFSET_IN_256B_BLOCK(addr)              ((addr) & 0xffULL)
46
/*
 * Per-instance channel index table: umc_v6_1_channel_idx_tbl[umc_inst][ch_inst]
 * maps a (UMC instance, channel instance) pair to its physical memory channel
 * index.  Exported (non-static) for use by other parts of the driver —
 * presumably consumed when translating UMC channel addresses to SoC physical
 * addresses; confirm against callers outside this file.
 */
const uint32_t
	umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = {
		{2, 18, 11, 27},	{4, 20, 13, 29},
		{1, 17, 8, 24},		{7, 23, 14, 30},
		{10, 26, 3, 19},	{12, 28, 5, 21},
		{9, 25, 0, 16},		{15, 31, 6, 22}
};
54
55 static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
56                                             uint32_t umc_inst,
57                                             uint32_t ch_inst)
58 {
59         return adev->umc.channel_offs*ch_inst + UMC_6_INST_DIST*umc_inst;
60 }
61
/*
 * umc_v6_1_query_correctable_error_count - accumulate the correctable (CE)
 * error count of one UMC channel into @error_count.
 * @adev: amdgpu device handle
 * @umc_reg_offset: register offset of the channel (see get_umc_6_reg_offset)
 * @error_count: running CE total, incremented in place
 *
 * Reads the ECC error counter for both chip selects (lower/higher) and
 * checks the 64-bit MCA status register for an SRAM CE.  The hardware
 * counters are preloaded with UMC_V6_1_CE_CNT_INIT, so the initial value
 * is subtracted from each raw read and the counter is rewound to
 * UMC_V6_1_CE_CNT_INIT after reading, keeping subsequent queries
 * delta-based.
 */
static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* register locations differ between UMC 6.1.1 and 6.1.2 (Arcturus) */
	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* select the lower chip (chip select 0) and check the error count */
	ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset);
	/* subtract the preload value so only real errors are accumulated */
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the lower chip err count (rewind to the preload value) */
	WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip (chip select 1) and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32(ecc_err_cnt_addr + umc_reg_offset);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the higher chip err count */
	WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT);

	/*
	 * Check for an SRAM correctable error.  MCUMC_STATUS is a 64-bit
	 * register, hence the 64-bit PCIE read; the dword register offset
	 * is scaled by 4 to a byte address.
	 * NOTE(review): ErrorCodeExt == 6 appears to identify the SRAM ECC
	 * error class — confirm against the MCA register spec.
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
120
121 static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev,
122                                                       uint32_t umc_reg_offset,
123                                                       unsigned long *error_count)
124 {
125         uint64_t mc_umc_status;
126         uint32_t mc_umc_status_addr;
127
128         if (adev->asic_type == CHIP_ARCTURUS) {
129                 /* UMC 6_1_2 registers */
130                 mc_umc_status_addr =
131                         SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
132         } else {
133                 /* UMC 6_1_1 registers */
134                 mc_umc_status_addr =
135                         SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
136         }
137
138         /* check the MCUMC_STATUS */
139         mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
140         if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
141             (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
142             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
143             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
144             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
145             REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
146                 *error_count += 1;
147 }
148
149 static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
150                                            void *ras_error_status)
151 {
152         struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status;
153
154         uint32_t umc_inst        = 0;
155         uint32_t ch_inst         = 0;
156         uint32_t umc_reg_offset  = 0;
157
158         for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) {
159                 for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) {
160                         umc_reg_offset = get_umc_6_reg_offset(adev,
161                                                               umc_inst,
162                                                               ch_inst);
163
164                         umc_v6_1_query_correctable_error_count(adev,
165                                                                umc_reg_offset,
166                                                                &(err_data->ce_count));
167                         umc_v6_1_querry_uncorrectable_error_count(adev,
168                                                                   umc_reg_offset,
169                                                                   &(err_data->ue_count));
170                 }
171         }
172 }
173
/*
 * umc_v6_1_query_error_address - read back the address of an ECC error on
 * one UMC channel and, for uncorrectable errors, append a retirement
 * record to @err_data.
 * @adev: amdgpu device handle
 * @err_data: RAS error data; err_addr[] receives new records, err_addr_cnt
 *            is advanced for each record written
 * @umc_reg_offset: register offset of the channel (see get_umc_6_reg_offset)
 * @channel_index: memory channel index folded into the translated address
 * @umc_inst: UMC instance number, stored in the record for identification
 *
 * The MCA status register is cleared on every exit path so stale state
 * cannot leak into the next query.
 */
static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t channel_index,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page;
	struct eeprom_table_record *err_rec;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* skip error address processing if err_addr allocation failed (-ENOMEM) */
	if (!err_data->err_addr) {
		/* still clear umc status so the error is not reported twice */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {
		err_addr = RREG64_PCIE(smnMCA_UMC0_MCUMC_ADDRT0 + umc_reg_offset * 4);

		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/*
		 * Translate the UMC channel address to a SoC physical
		 * address, assembled from 3 parts: the 8KB-block index,
		 * the channel index (256B-block index) and the offset
		 * within the 256B block.
		 */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
239
240 static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev,
241                                              void *ras_error_status)
242 {
243         struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status;
244
245         uint32_t umc_inst        = 0;
246         uint32_t ch_inst         = 0;
247         uint32_t umc_reg_offset  = 0;
248
249         for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) {
250                 for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) {
251                         umc_reg_offset = get_umc_6_reg_offset(adev,
252                                                               umc_inst,
253                                                               ch_inst);
254
255                         umc_v6_1_query_error_address(adev,
256                                                      err_data,
257                                                      umc_reg_offset,
258                                                      ch_inst,
259                                                      umc_inst);
260                 }
261         }
262
263 }
264
/*
 * umc_v6_1_err_cnt_init_per_channel - initialize the ECC error counters
 * of one UMC channel.
 * @adev: amdgpu device handle
 * @umc_reg_offset: register offset of the channel (see get_umc_6_reg_offset)
 *
 * Preloads the counter of both chip selects with UMC_V6_1_CE_CNT_INIT
 * (the query path subtracts this value to obtain the real error delta)
 * and enables the CE error interrupt.
 */
static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	/* register locations differ between UMC 6.1.1 and 6.1.2 (Arcturus) */
	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
	}

	/* select the lower chip (chip select 0) */
	ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrInt, 0x1);
	WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel);
	/* set error count to initial value */
	WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip (chip select 1) and preload its counter too */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32(ecc_err_cnt_sel_addr + umc_reg_offset, ecc_err_cnt_sel);
	WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT);
}
302
303 static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev)
304 {
305         uint32_t umc_inst        = 0;
306         uint32_t ch_inst         = 0;
307         uint32_t umc_reg_offset  = 0;
308
309         for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) {
310                 for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) {
311                         umc_reg_offset = get_umc_6_reg_offset(adev,
312                                                               umc_inst,
313                                                               ch_inst);
314
315                         umc_v6_1_err_cnt_init_per_channel(adev, umc_reg_offset);
316                 }
317         }
318 }
319
/* UMC 6.1 RAS callback table, registered with the amdgpu RAS core */
const struct amdgpu_umc_funcs umc_v6_1_funcs = {
	.err_cnt_init = umc_v6_1_err_cnt_init,
	.ras_late_init = amdgpu_umc_ras_late_init,
	.query_ras_error_count = umc_v6_1_query_ras_error_count,
	.query_ras_error_address = umc_v6_1_query_ras_error_address,
};