/* ras_tests.c revision 88f8a8d2 */
1/* 2 * Copyright 2017 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 
21 * 22*/ 23 24#include "CUnit/Basic.h" 25 26#include "amdgpu_test.h" 27#include "amdgpu_drm.h" 28#include "amdgpu_internal.h" 29#include <unistd.h> 30#include <fcntl.h> 31#include <stdio.h> 32#include "xf86drm.h" 33 34#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) 35 36const char *ras_block_string[] = { 37 "umc", 38 "sdma", 39 "gfx", 40 "mmhub", 41 "athub", 42 "pcie_bif", 43 "hdp", 44 "xgmi_wafl", 45 "df", 46 "smn", 47 "sem", 48 "mp0", 49 "mp1", 50 "fuse", 51}; 52 53#define ras_block_str(i) (ras_block_string[i]) 54 55enum amdgpu_ras_block { 56 AMDGPU_RAS_BLOCK__UMC = 0, 57 AMDGPU_RAS_BLOCK__SDMA, 58 AMDGPU_RAS_BLOCK__GFX, 59 AMDGPU_RAS_BLOCK__MMHUB, 60 AMDGPU_RAS_BLOCK__ATHUB, 61 AMDGPU_RAS_BLOCK__PCIE_BIF, 62 AMDGPU_RAS_BLOCK__HDP, 63 AMDGPU_RAS_BLOCK__XGMI_WAFL, 64 AMDGPU_RAS_BLOCK__DF, 65 AMDGPU_RAS_BLOCK__SMN, 66 AMDGPU_RAS_BLOCK__SEM, 67 AMDGPU_RAS_BLOCK__MP0, 68 AMDGPU_RAS_BLOCK__MP1, 69 AMDGPU_RAS_BLOCK__FUSE, 70 71 AMDGPU_RAS_BLOCK__LAST 72}; 73 74#define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST 75#define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) 76 77enum amdgpu_ras_gfx_subblock { 78 /* CPC */ 79 AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START = 0, 80 AMDGPU_RAS_BLOCK__GFX_CPC_SCRATCH = 81 AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START, 82 AMDGPU_RAS_BLOCK__GFX_CPC_UCODE, 83 AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME1, 84 AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME1, 85 AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME1, 86 AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME2, 87 AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME2, 88 AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2, 89 AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_END = 90 AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2, 91 /* CPF */ 92 AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START, 93 AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME2 = 94 AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START, 95 AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME1, 96 AMDGPU_RAS_BLOCK__GFX_CPF_TAG, 97 AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPF_TAG, 98 /* CPG */ 99 AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START, 100 
AMDGPU_RAS_BLOCK__GFX_CPG_DMA_ROQ = 101 AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START, 102 AMDGPU_RAS_BLOCK__GFX_CPG_DMA_TAG, 103 AMDGPU_RAS_BLOCK__GFX_CPG_TAG, 104 AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPG_TAG, 105 /* GDS */ 106 AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START, 107 AMDGPU_RAS_BLOCK__GFX_GDS_MEM = AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START, 108 AMDGPU_RAS_BLOCK__GFX_GDS_INPUT_QUEUE, 109 AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_CMD_RAM_MEM, 110 AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_DATA_RAM_MEM, 111 AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM, 112 AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_END = 113 AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM, 114 /* SPI */ 115 AMDGPU_RAS_BLOCK__GFX_SPI_SR_MEM, 116 /* SQ */ 117 AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START, 118 AMDGPU_RAS_BLOCK__GFX_SQ_SGPR = AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START, 119 AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D, 120 AMDGPU_RAS_BLOCK__GFX_SQ_LDS_I, 121 AMDGPU_RAS_BLOCK__GFX_SQ_VGPR, 122 AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_END = AMDGPU_RAS_BLOCK__GFX_SQ_VGPR, 123 /* SQC (3 ranges) */ 124 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START, 125 /* SQC range 0 */ 126 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START = 127 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START, 128 AMDGPU_RAS_BLOCK__GFX_SQC_INST_UTCL1_LFIFO = 129 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START, 130 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_WRITE_DATA_BUF, 131 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_UTCL1_LFIFO, 132 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_WRITE_DATA_BUF, 133 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO, 134 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_WRITE_DATA_BUF, 135 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO, 136 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_END = 137 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO, 138 /* SQC range 1 */ 139 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START, 140 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM = 141 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START, 142 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO, 143 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_MISS_FIFO, 144 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_BANK_RAM, 
145 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_TAG_RAM, 146 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_HIT_FIFO, 147 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_MISS_FIFO, 148 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM, 149 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM, 150 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_END = 151 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM, 152 /* SQC range 2 */ 153 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START, 154 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM = 155 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START, 156 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO, 157 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_MISS_FIFO, 158 AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_BANK_RAM, 159 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_TAG_RAM, 160 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_HIT_FIFO, 161 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_MISS_FIFO, 162 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM, 163 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM, 164 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END = 165 AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM, 166 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_END = 167 AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END, 168 /* TA */ 169 AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START, 170 AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO = 171 AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START, 172 AMDGPU_RAS_BLOCK__GFX_TA_FS_AFIFO, 173 AMDGPU_RAS_BLOCK__GFX_TA_FL_LFIFO, 174 AMDGPU_RAS_BLOCK__GFX_TA_FX_LFIFO, 175 AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO, 176 AMDGPU_RAS_BLOCK__GFX_TA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO, 177 /* TCA */ 178 AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START, 179 AMDGPU_RAS_BLOCK__GFX_TCA_HOLE_FIFO = 180 AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START, 181 AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO, 182 AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_END = 183 AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO, 184 /* TCC (5 sub-ranges) */ 185 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START, 186 /* TCC range 0 */ 187 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START = 188 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START, 189 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA = 190 
AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START, 191 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1, 192 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0, 193 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1, 194 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_0, 195 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_1, 196 AMDGPU_RAS_BLOCK__GFX_TCC_HIGH_RATE_TAG, 197 AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG, 198 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_END = 199 AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG, 200 /* TCC range 1 */ 201 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START, 202 AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_DEC = 203 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START, 204 AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER, 205 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_END = 206 AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER, 207 /* TCC range 2 */ 208 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START, 209 AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_DATA = 210 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START, 211 AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_CONTROL, 212 AMDGPU_RAS_BLOCK__GFX_TCC_UC_ATOMIC_FIFO, 213 AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_RETURN, 214 AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_CACHE_READ, 215 AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO, 216 AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO_NEXT_RAM, 217 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO, 218 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_END = 219 AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO, 220 /* TCC range 3 */ 221 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START, 222 AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO = 223 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START, 224 AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM, 225 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_END = 226 AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM, 227 /* TCC range 4 */ 228 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START, 229 AMDGPU_RAS_BLOCK__GFX_TCC_WRRET_TAG_WRITE_RETURN = 230 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START, 231 AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER, 232 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END = 233 AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER, 234 AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_END = 235 
AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END, 236 /* TCI */ 237 AMDGPU_RAS_BLOCK__GFX_TCI_WRITE_RAM, 238 /* TCP */ 239 AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START, 240 AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM = 241 AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START, 242 AMDGPU_RAS_BLOCK__GFX_TCP_LFIFO_RAM, 243 AMDGPU_RAS_BLOCK__GFX_TCP_CMD_FIFO, 244 AMDGPU_RAS_BLOCK__GFX_TCP_VM_FIFO, 245 AMDGPU_RAS_BLOCK__GFX_TCP_DB_RAM, 246 AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO0, 247 AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1, 248 AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_END = 249 AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1, 250 /* TD */ 251 AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START, 252 AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO = 253 AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START, 254 AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_HI, 255 AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO, 256 AMDGPU_RAS_BLOCK__GFX_TD_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO, 257 /* EA (3 sub-ranges) */ 258 AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START, 259 /* EA range 0 */ 260 AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START = 261 AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START, 262 AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM = 263 AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START, 264 AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_CMDMEM, 265 AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_DATAMEM, 266 AMDGPU_RAS_BLOCK__GFX_EA_RRET_TAGMEM, 267 AMDGPU_RAS_BLOCK__GFX_EA_WRET_TAGMEM, 268 AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_CMDMEM, 269 AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_CMDMEM, 270 AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM, 271 AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_END = 272 AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM, 273 /* EA range 1 */ 274 AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START, 275 AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_PAGEMEM = 276 AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START, 277 AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_PAGEMEM, 278 AMDGPU_RAS_BLOCK__GFX_EA_IORD_CMDMEM, 279 AMDGPU_RAS_BLOCK__GFX_EA_IOWR_CMDMEM, 280 AMDGPU_RAS_BLOCK__GFX_EA_IOWR_DATAMEM, 281 AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_PAGEMEM, 282 AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM, 283 AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_END = 284 
AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM, 285 /* EA range 2 */ 286 AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START, 287 AMDGPU_RAS_BLOCK__GFX_EA_MAM_D0MEM = 288 AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START, 289 AMDGPU_RAS_BLOCK__GFX_EA_MAM_D1MEM, 290 AMDGPU_RAS_BLOCK__GFX_EA_MAM_D2MEM, 291 AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM, 292 AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END = 293 AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM, 294 AMDGPU_RAS_BLOCK__GFX_EA_INDEX_END = 295 AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END, 296 /* UTC VM L2 bank */ 297 AMDGPU_RAS_BLOCK__UTC_VML2_BANK_CACHE, 298 /* UTC VM walker */ 299 AMDGPU_RAS_BLOCK__UTC_VML2_WALKER, 300 /* UTC ATC L2 2MB cache */ 301 AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_2M_BANK, 302 /* UTC ATC L2 4KB cache */ 303 AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_4K_BANK, 304 AMDGPU_RAS_BLOCK__GFX_MAX 305}; 306 307enum amdgpu_ras_error_type { 308 AMDGPU_RAS_ERROR__NONE = 0, 309 AMDGPU_RAS_ERROR__PARITY = 1, 310 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2, 311 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4, 312 AMDGPU_RAS_ERROR__POISON = 8, 313}; 314 315struct ras_inject_test_config { 316 char name[64]; 317 char block[32]; 318 int sub_block; 319 enum amdgpu_ras_error_type type; 320 uint64_t address; 321 uint64_t value; 322}; 323 324struct ras_common_if { 325 enum amdgpu_ras_block block; 326 enum amdgpu_ras_error_type type; 327 uint32_t sub_block_index; 328 char name[32]; 329}; 330 331struct ras_inject_if { 332 struct ras_common_if head; 333 uint64_t address; 334 uint64_t value; 335}; 336 337struct ras_debug_if { 338 union { 339 struct ras_common_if head; 340 struct ras_inject_if inject; 341 }; 342 int op; 343}; 344/* for now, only umc, gfx, sdma has implemented. 
*/ 345#define DEFAULT_RAS_BLOCK_MASK_INJECT ((1 << AMDGPU_RAS_BLOCK__UMC) |\ 346 (1 << AMDGPU_RAS_BLOCK__GFX)) 347#define DEFAULT_RAS_BLOCK_MASK_QUERY ((1 << AMDGPU_RAS_BLOCK__UMC) |\ 348 (1 << AMDGPU_RAS_BLOCK__GFX)) 349#define DEFAULT_RAS_BLOCK_MASK_BASIC (1 << AMDGPU_RAS_BLOCK__UMC |\ 350 (1 << AMDGPU_RAS_BLOCK__SDMA) |\ 351 (1 << AMDGPU_RAS_BLOCK__GFX)) 352 353static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT; 354static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_INJECT; 355static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC; 356 357struct ras_test_mask { 358 uint32_t inject_mask; 359 uint32_t query_mask; 360 uint32_t basic_mask; 361}; 362 363struct amdgpu_ras_data { 364 amdgpu_device_handle device_handle; 365 uint32_t id; 366 uint32_t capability; 367 struct ras_test_mask test_mask; 368}; 369 370/* all devices who has ras supported */ 371static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED]; 372static int devices_count; 373 374struct ras_DID_test_mask{ 375 uint16_t device_id; 376 uint16_t revision_id; 377 struct ras_test_mask test_mask; 378}; 379 380/* white list for inject test. 
*/ 381#define RAS_BLOCK_MASK_ALL {\ 382 DEFAULT_RAS_BLOCK_MASK_INJECT,\ 383 DEFAULT_RAS_BLOCK_MASK_QUERY,\ 384 DEFAULT_RAS_BLOCK_MASK_BASIC\ 385} 386 387#define RAS_BLOCK_MASK_QUERY_BASIC {\ 388 0,\ 389 DEFAULT_RAS_BLOCK_MASK_QUERY,\ 390 DEFAULT_RAS_BLOCK_MASK_BASIC\ 391} 392 393static const struct ras_inject_test_config umc_ras_inject_test[] = { 394 {"ras_umc.1.0", "umc", 0, AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 395}; 396 397static const struct ras_inject_test_config gfx_ras_inject_test[] = { 398 {"ras_gfx.2.0", "gfx", AMDGPU_RAS_BLOCK__GFX_CPC_UCODE, 399 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 400 {"ras_gfx.2.1", "gfx", AMDGPU_RAS_BLOCK__GFX_CPF_TAG, 401 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 402 {"ras_gfx.2.2", "gfx", AMDGPU_RAS_BLOCK__GFX_CPG_TAG, 403 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 404 {"ras_gfx.2.3", "gfx", AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D, 405 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 406 {"ras_gfx.2.4", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO, 407 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 408 {"ras_gfx.2.5", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM, 409 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 410 {"ras_gfx.2.6", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM, 411 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 412 {"ras_gfx.2.7", "gfx", AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO, 413 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 414 {"ras_gfx.2.8", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA, 415 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 416 {"ras_gfx.2.9", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1, 417 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 418 {"ras_gfx.2.10", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0, 419 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 420 {"ras_gfx.2.11", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1, 421 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 422 {"ras_gfx.2.12", "gfx", AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM, 423 
AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 424 {"ras_gfx.2.13", "gfx", AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO, 425 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 426 {"ras_gfx.2.14", "gfx", AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM, 427 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 428}; 429 430static const struct ras_DID_test_mask ras_DID_array[] = { 431 {0x66a1, 0x00, RAS_BLOCK_MASK_ALL}, 432 {0x66a1, 0x01, RAS_BLOCK_MASK_ALL}, 433 {0x66a1, 0x04, RAS_BLOCK_MASK_ALL}, 434}; 435 436static uint32_t amdgpu_ras_find_block_id_by_name(const char *name) 437{ 438 int i; 439 440 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { 441 if (strcmp(name, ras_block_string[i]) == 0) 442 return i; 443 } 444 445 return ARRAY_SIZE(ras_block_string); 446} 447 448static char *amdgpu_ras_get_error_type_id(enum amdgpu_ras_error_type type) 449{ 450 switch (type) { 451 case AMDGPU_RAS_ERROR__PARITY: 452 return "parity"; 453 case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: 454 return "single_correctable"; 455 case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: 456 return "multi_uncorrectable"; 457 case AMDGPU_RAS_ERROR__POISON: 458 return "poison"; 459 case AMDGPU_RAS_ERROR__NONE: 460 default: 461 return NULL; 462 } 463} 464 465static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device) 466{ 467 int i; 468 static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC; 469 470 for (i = 0; i < sizeof(ras_DID_array) / sizeof(ras_DID_array[0]); i++) { 471 if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id && 472 ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id) 473 return ras_DID_array[i].test_mask; 474 } 475 return default_test_mask; 476} 477 478static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle) 479{ 480 union { 481 uint64_t feature_mask; 482 struct { 483 uint32_t enabled_features; 484 uint32_t supported_features; 485 }; 486 } features = { 0 }; 487 int ret; 488 489 ret = amdgpu_query_info(device_handle, 
AMDGPU_INFO_RAS_ENABLED_FEATURES, 490 sizeof(features), &features); 491 if (ret) 492 return 0; 493 494 return features.supported_features; 495} 496 497static int get_file_contents(char *file, char *buf, int size); 498 499static int amdgpu_ras_lookup_id(drmDevicePtr device) 500{ 501 char path[1024]; 502 char str[128]; 503 drmPciBusInfo info; 504 int i; 505 int ret; 506 507 for (i = 0; i < MAX_CARDS_SUPPORTED; i++) { 508 memset(str, 0, sizeof(str)); 509 memset(&info, 0, sizeof(info)); 510 sprintf(path, "/sys/kernel/debug/dri/%d/name", i); 511 if (get_file_contents(path, str, sizeof(str)) <= 0) 512 continue; 513 514 ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx", 515 &info.domain, &info.bus, &info.dev, &info.func); 516 if (ret != 4) 517 continue; 518 519 if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0) 520 return i; 521 } 522 return -1; 523} 524 525CU_BOOL suite_ras_tests_enable(void) 526{ 527 amdgpu_device_handle device_handle; 528 uint32_t major_version; 529 uint32_t minor_version; 530 int i; 531 drmDevicePtr device; 532 533 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { 534 if (amdgpu_device_initialize(drm_amdgpu[i], &major_version, 535 &minor_version, &device_handle)) 536 continue; 537 538 if (drmGetDevice2(drm_amdgpu[i], 539 DRM_DEVICE_GET_PCI_REVISION, 540 &device)) 541 continue; 542 543 if (device->bustype == DRM_BUS_PCI && 544 amdgpu_ras_lookup_capability(device_handle)) { 545 amdgpu_device_deinitialize(device_handle); 546 return CU_TRUE; 547 } 548 549 if (amdgpu_device_deinitialize(device_handle)) 550 continue; 551 } 552 553 return CU_FALSE; 554} 555 556int suite_ras_tests_init(void) 557{ 558 drmDevicePtr device; 559 amdgpu_device_handle device_handle; 560 uint32_t major_version; 561 uint32_t minor_version; 562 uint32_t capability; 563 struct ras_test_mask test_mask; 564 int id; 565 int i; 566 int r; 567 568 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { 569 r = amdgpu_device_initialize(drm_amdgpu[i], 
&major_version, 570 &minor_version, &device_handle); 571 if (r) 572 continue; 573 574 if (drmGetDevice2(drm_amdgpu[i], 575 DRM_DEVICE_GET_PCI_REVISION, 576 &device)) { 577 amdgpu_device_deinitialize(device_handle); 578 continue; 579 } 580 581 if (device->bustype != DRM_BUS_PCI) { 582 amdgpu_device_deinitialize(device_handle); 583 continue; 584 } 585 586 capability = amdgpu_ras_lookup_capability(device_handle); 587 if (capability == 0) { 588 amdgpu_device_deinitialize(device_handle); 589 continue; 590 591 } 592 593 id = amdgpu_ras_lookup_id(device); 594 if (id == -1) { 595 amdgpu_device_deinitialize(device_handle); 596 continue; 597 } 598 599 test_mask = amdgpu_ras_get_test_mask(device); 600 601 devices[devices_count++] = (struct amdgpu_ras_data) { 602 device_handle, id, capability, test_mask, 603 }; 604 } 605 606 if (devices_count == 0) 607 return CUE_SINIT_FAILED; 608 609 return CUE_SUCCESS; 610} 611 612int suite_ras_tests_clean(void) 613{ 614 int r; 615 int i; 616 int ret = CUE_SUCCESS; 617 618 for (i = 0; i < devices_count; i++) { 619 r = amdgpu_device_deinitialize(devices[i].device_handle); 620 if (r) 621 ret = CUE_SCLEAN_FAILED; 622 } 623 return ret; 624} 625 626static void amdgpu_ras_disable_test(void); 627static void amdgpu_ras_enable_test(void); 628static void amdgpu_ras_inject_test(void); 629static void amdgpu_ras_query_test(void); 630static void amdgpu_ras_basic_test(void); 631 632CU_TestInfo ras_tests[] = { 633 { "ras basic test", amdgpu_ras_basic_test }, 634 { "ras query test", amdgpu_ras_query_test }, 635 { "ras inject test", amdgpu_ras_inject_test }, 636 { "ras disable test", amdgpu_ras_disable_test }, 637#if 0 638 { "ras enable test", amdgpu_ras_enable_test }, 639#endif 640 CU_TEST_INFO_NULL, 641}; 642 643//helpers 644 645static int test_card; 646static char sysfs_path[1024]; 647static char debugfs_path[1024]; 648static uint32_t ras_mask; 649static amdgpu_device_handle device_handle; 650 651static int set_test_card(int card) 652{ 653 int i; 654 655 
test_card = card; 656 sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id); 657 sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id); 658 ras_mask = devices[card].capability; 659 device_handle = devices[card].device_handle; 660 ras_block_mask_inject = devices[card].test_mask.inject_mask; 661 ras_block_mask_query = devices[card].test_mask.query_mask; 662 ras_block_mask_basic = devices[card].test_mask.basic_mask; 663 664 return 0; 665} 666 667static const char *get_ras_sysfs_root(void) 668{ 669 return sysfs_path; 670} 671 672static const char *get_ras_debugfs_root(void) 673{ 674 return debugfs_path; 675} 676 677static int set_file_contents(char *file, char *buf, int size) 678{ 679 int n, fd; 680 fd = open(file, O_WRONLY); 681 if (fd == -1) 682 return -1; 683 n = write(fd, buf, size); 684 close(fd); 685 return n; 686} 687 688static int get_file_contents(char *file, char *buf, int size) 689{ 690 int n, fd; 691 fd = open(file, O_RDONLY); 692 if (fd == -1) 693 return -1; 694 n = read(fd, buf, size); 695 close(fd); 696 return n; 697} 698 699static int is_file_ok(char *file, int flags) 700{ 701 int fd; 702 703 fd = open(file, flags); 704 if (fd == -1) 705 return -1; 706 close(fd); 707 return 0; 708} 709 710static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block) 711{ 712 uint32_t feature_mask; 713 int ret; 714 715 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, 716 sizeof(feature_mask), &feature_mask); 717 if (ret) 718 return -1; 719 720 return (1 << block) & feature_mask; 721} 722 723static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block) 724{ 725 return (1 << block) & ras_mask; 726} 727 728static int amdgpu_ras_invoke(struct ras_debug_if *data) 729{ 730 char path[1024]; 731 int ret; 732 733 sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl"); 734 735 ret = set_file_contents(path, (char *)data, sizeof(*data)) 736 - sizeof(*data); 737 return ret; 738} 739 740static int 
amdgpu_ras_query_err_count(enum amdgpu_ras_block block, 741 unsigned long *ue, unsigned long *ce) 742{ 743 char buf[64]; 744 char name[1024]; 745 int ret; 746 747 *ue = *ce = 0; 748 749 if (amdgpu_ras_is_feature_supported(block) <= 0) 750 return -1; 751 752 sprintf(name, "%s%s%s", get_ras_sysfs_root(), ras_block_str(block), "_err_count"); 753 754 if (is_file_ok(name, O_RDONLY)) 755 return 0; 756 757 if (get_file_contents(name, buf, sizeof(buf)) <= 0) 758 return -1; 759 760 if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2) 761 return -1; 762 763 return 0; 764} 765 766static int amdgpu_ras_inject(enum amdgpu_ras_block block, 767 uint32_t sub_block, enum amdgpu_ras_error_type type, 768 uint64_t address, uint64_t value) 769{ 770 struct ras_debug_if data = { .op = 2, }; 771 struct ras_inject_if *inject = &data.inject; 772 int ret; 773 774 if (amdgpu_ras_is_feature_enabled(block) <= 0) { 775 fprintf(stderr, "block id(%d) is not valid\n", block); 776 return -1; 777 } 778 779 inject->head.block = block; 780 inject->head.type = type; 781 inject->head.sub_block_index = sub_block; 782 strncpy(inject->head.name, ras_block_str(block), 32); 783 inject->address = address; 784 inject->value = value; 785 786 ret = amdgpu_ras_invoke(&data); 787 CU_ASSERT_EQUAL(ret, 0); 788 if (ret) 789 return -1; 790 791 return 0; 792} 793 794//tests 795static void amdgpu_ras_features_test(int enable) 796{ 797 struct ras_debug_if data; 798 int ret; 799 int i; 800 801 data.op = enable; 802 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { 803 struct ras_common_if head = { 804 .block = i, 805 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, 806 .sub_block_index = 0, 807 .name = "", 808 }; 809 810 if (amdgpu_ras_is_feature_supported(i) <= 0) 811 continue; 812 813 data.head = head; 814 815 ret = amdgpu_ras_invoke(&data); 816 CU_ASSERT_EQUAL(ret, 0); 817 818 if (ret) 819 continue; 820 821 ret = enable ^ amdgpu_ras_is_feature_enabled(i); 822 CU_ASSERT_EQUAL(ret, 0); 823 } 824} 825 826static void 
amdgpu_ras_disable_test(void) 827{ 828 int i; 829 for (i = 0; i < devices_count; i++) { 830 set_test_card(i); 831 amdgpu_ras_features_test(0); 832 } 833} 834 835static void amdgpu_ras_enable_test(void) 836{ 837 int i; 838 for (i = 0; i < devices_count; i++) { 839 set_test_card(i); 840 amdgpu_ras_features_test(1); 841 } 842} 843 844static void __amdgpu_ras_ip_inject_test(const struct ras_inject_test_config *ip_test, 845 uint32_t size) 846{ 847 int i, ret; 848 unsigned long old_ue, old_ce; 849 unsigned long ue, ce; 850 uint32_t block; 851 int timeout; 852 bool pass; 853 854 for (i = 0; i < size; i++) { 855 timeout = 3; 856 pass = false; 857 858 block = amdgpu_ras_find_block_id_by_name(ip_test[i].block); 859 860 /* Ensure one valid ip block */ 861 if (block == ARRAY_SIZE(ras_block_string)) 862 break; 863 864 /* Ensure RAS feature for the IP block is enabled by kernel */ 865 if (amdgpu_ras_is_feature_supported(block) <= 0) 866 break; 867 868 ret = amdgpu_ras_query_err_count(block, &old_ue, &old_ce); 869 CU_ASSERT_EQUAL(ret, 0); 870 if (ret) 871 break; 872 873 ret = amdgpu_ras_inject(block, 874 ip_test[i].sub_block, 875 ip_test[i].type, 876 ip_test[i].address, 877 ip_test[i].value); 878 CU_ASSERT_EQUAL(ret, 0); 879 if (ret) 880 break; 881 882 while (timeout > 0) { 883 sleep(5); 884 885 ret = amdgpu_ras_query_err_count(block, &ue, &ce); 886 CU_ASSERT_EQUAL(ret, 0); 887 if (ret) 888 break; 889 890 if (old_ue != ue || old_ce != ce) { 891 pass = true; 892 sleep(20); 893 break; 894 } 895 timeout -= 1; 896 } 897 printf("\t Test %s@block %s, subblock %d, error_type %s, address %ld, value %ld: %s\n", 898 ip_test[i].name, 899 ip_test[i].block, 900 ip_test[i].sub_block, 901 amdgpu_ras_get_error_type_id(ip_test[i].type), 902 ip_test[i].address, 903 ip_test[i].value, 904 pass ? 
"Pass" : "Fail"); 905 } 906} 907 908static void __amdgpu_ras_inject_test(void) 909{ 910 printf("...\n"); 911 912 /* run UMC ras inject test */ 913 __amdgpu_ras_ip_inject_test(umc_ras_inject_test, 914 ARRAY_SIZE(umc_ras_inject_test)); 915 916 /* run GFX ras inject test */ 917 __amdgpu_ras_ip_inject_test(gfx_ras_inject_test, 918 ARRAY_SIZE(gfx_ras_inject_test)); 919} 920 921static void amdgpu_ras_inject_test(void) 922{ 923 int i; 924 for (i = 0; i < devices_count; i++) { 925 set_test_card(i); 926 __amdgpu_ras_inject_test(); 927 } 928} 929 930static void __amdgpu_ras_query_test(void) 931{ 932 unsigned long ue, ce; 933 int ret; 934 int i; 935 936 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { 937 if (amdgpu_ras_is_feature_supported(i) <= 0) 938 continue; 939 940 if (!((1 << i) & ras_block_mask_query)) 941 continue; 942 943 ret = amdgpu_ras_query_err_count(i, &ue, &ce); 944 CU_ASSERT_EQUAL(ret, 0); 945 } 946} 947 948static void amdgpu_ras_query_test(void) 949{ 950 int i; 951 for (i = 0; i < devices_count; i++) { 952 set_test_card(i); 953 __amdgpu_ras_query_test(); 954 } 955} 956 957static void amdgpu_ras_basic_test(void) 958{ 959 unsigned long ue, ce; 960 char name[1024]; 961 int ret; 962 int i; 963 int j; 964 uint32_t features; 965 char path[1024]; 966 967 ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY); 968 CU_ASSERT_EQUAL(ret, 0); 969 970 for (i = 0; i < devices_count; i++) { 971 set_test_card(i); 972 973 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, 974 sizeof(features), &features); 975 CU_ASSERT_EQUAL(ret, 0); 976 977 sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl"); 978 ret = is_file_ok(path, O_WRONLY); 979 CU_ASSERT_EQUAL(ret, 0); 980 981 sprintf(path, "%s%s", get_ras_sysfs_root(), "features"); 982 ret = is_file_ok(path, O_RDONLY); 983 CU_ASSERT_EQUAL(ret, 0); 984 985 for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) { 986 ret = amdgpu_ras_is_feature_supported(j); 987 if (ret <= 0) 988 continue; 989 990 if (!((1 
<< j) & ras_block_mask_basic)) 991 continue; 992 993 sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count"); 994 ret = is_file_ok(path, O_RDONLY); 995 CU_ASSERT_EQUAL(ret, 0); 996 997 sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject"); 998 ret = is_file_ok(path, O_WRONLY); 999 CU_ASSERT_EQUAL(ret, 0); 1000 } 1001 } 1002} 1003