/* ras_tests.c revision 5324fb0d */
1/* 2 * Copyright 2017 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 
21 * 22*/ 23 24#include "CUnit/Basic.h" 25 26#include "amdgpu_test.h" 27#include "amdgpu_drm.h" 28#include "amdgpu_internal.h" 29#include <unistd.h> 30#include <fcntl.h> 31#include <stdio.h> 32#include "xf86drm.h" 33 34const char *ras_block_string[] = { 35 "umc", 36 "sdma", 37 "gfx", 38 "mmhub", 39 "athub", 40 "pcie_bif", 41 "hdp", 42 "xgmi_wafl", 43 "df", 44 "smn", 45 "sem", 46 "mp0", 47 "mp1", 48 "fuse", 49}; 50 51#define ras_block_str(i) (ras_block_string[i]) 52 53enum amdgpu_ras_block { 54 AMDGPU_RAS_BLOCK__UMC = 0, 55 AMDGPU_RAS_BLOCK__SDMA, 56 AMDGPU_RAS_BLOCK__GFX, 57 AMDGPU_RAS_BLOCK__MMHUB, 58 AMDGPU_RAS_BLOCK__ATHUB, 59 AMDGPU_RAS_BLOCK__PCIE_BIF, 60 AMDGPU_RAS_BLOCK__HDP, 61 AMDGPU_RAS_BLOCK__XGMI_WAFL, 62 AMDGPU_RAS_BLOCK__DF, 63 AMDGPU_RAS_BLOCK__SMN, 64 AMDGPU_RAS_BLOCK__SEM, 65 AMDGPU_RAS_BLOCK__MP0, 66 AMDGPU_RAS_BLOCK__MP1, 67 AMDGPU_RAS_BLOCK__FUSE, 68 69 AMDGPU_RAS_BLOCK__LAST 70}; 71 72#define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST 73#define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) 74 75enum amdgpu_ras_error_type { 76 AMDGPU_RAS_ERROR__NONE = 0, 77 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2, 78 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4, 79 AMDGPU_RAS_ERROR__POISON = 8, 80}; 81 82struct ras_common_if { 83 enum amdgpu_ras_block block; 84 enum amdgpu_ras_error_type type; 85 uint32_t sub_block_index; 86 char name[32]; 87}; 88 89struct ras_inject_if { 90 struct ras_common_if head; 91 uint64_t address; 92 uint64_t value; 93}; 94 95struct ras_debug_if { 96 union { 97 struct ras_common_if head; 98 struct ras_inject_if inject; 99 }; 100 int op; 101}; 102/* for now, only umc, gfx, sdma has implemented. 
*/ 103#define DEFAULT_RAS_BLOCK_MASK_INJECT (1 << AMDGPU_RAS_BLOCK__UMC) 104#define DEFAULT_RAS_BLOCK_MASK_QUERY (1 << AMDGPU_RAS_BLOCK__UMC) 105#define DEFAULT_RAS_BLOCK_MASK_BASIC (1 << AMDGPU_RAS_BLOCK__UMC |\ 106 (1 << AMDGPU_RAS_BLOCK__SDMA) |\ 107 (1 << AMDGPU_RAS_BLOCK__GFX)) 108 109static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT; 110static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_INJECT; 111static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC; 112 113struct ras_test_mask { 114 uint32_t inject_mask; 115 uint32_t query_mask; 116 uint32_t basic_mask; 117}; 118 119struct amdgpu_ras_data { 120 amdgpu_device_handle device_handle; 121 uint32_t id; 122 uint32_t capability; 123 struct ras_test_mask test_mask; 124}; 125 126/* all devices who has ras supported */ 127static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED]; 128static int devices_count; 129 130struct ras_DID_test_mask{ 131 uint16_t device_id; 132 uint16_t revision_id; 133 struct ras_test_mask test_mask; 134}; 135 136/* white list for inject test. 
*/ 137#define RAS_BLOCK_MASK_ALL {\ 138 DEFAULT_RAS_BLOCK_MASK_INJECT,\ 139 DEFAULT_RAS_BLOCK_MASK_QUERY,\ 140 DEFAULT_RAS_BLOCK_MASK_BASIC\ 141} 142 143#define RAS_BLOCK_MASK_QUERY_BASIC {\ 144 0,\ 145 DEFAULT_RAS_BLOCK_MASK_QUERY,\ 146 DEFAULT_RAS_BLOCK_MASK_BASIC\ 147} 148 149static const struct ras_DID_test_mask ras_DID_array[] = { 150 {0x66a1, 0x00, RAS_BLOCK_MASK_ALL}, 151 {0x66a1, 0x01, RAS_BLOCK_MASK_ALL}, 152 {0x66a1, 0x04, RAS_BLOCK_MASK_ALL}, 153}; 154 155static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device) 156{ 157 int i; 158 static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC; 159 160 for (i = 0; i < sizeof(ras_DID_array) / sizeof(ras_DID_array[0]); i++) { 161 if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id && 162 ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id) 163 return ras_DID_array[i].test_mask; 164 } 165 return default_test_mask; 166} 167 168static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle) 169{ 170 union { 171 uint64_t feature_mask; 172 struct { 173 uint32_t enabled_features; 174 uint32_t supported_features; 175 }; 176 } features = { 0 }; 177 int ret; 178 179 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, 180 sizeof(features), &features); 181 if (ret) 182 return 0; 183 184 return features.supported_features; 185} 186 187static int get_file_contents(char *file, char *buf, int size); 188 189static int amdgpu_ras_lookup_id(drmDevicePtr device) 190{ 191 char path[1024]; 192 char str[128]; 193 drmPciBusInfo info; 194 int i; 195 int ret; 196 197 for (i = 0; i < MAX_CARDS_SUPPORTED; i++) { 198 memset(str, 0, sizeof(str)); 199 memset(&info, 0, sizeof(info)); 200 sprintf(path, "/sys/kernel/debug/dri/%d/name", i); 201 if (get_file_contents(path, str, sizeof(str)) <= 0) 202 continue; 203 204 ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx", 205 &info.domain, &info.bus, &info.dev, &info.func); 206 if (ret != 
4) 207 continue; 208 209 if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0) 210 return i; 211 } 212 return -1; 213} 214 215CU_BOOL suite_ras_tests_enable(void) 216{ 217 amdgpu_device_handle device_handle; 218 uint32_t major_version; 219 uint32_t minor_version; 220 int i; 221 drmDevicePtr device; 222 223 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { 224 if (amdgpu_device_initialize(drm_amdgpu[i], &major_version, 225 &minor_version, &device_handle)) 226 continue; 227 228 if (drmGetDevice2(drm_amdgpu[i], 229 DRM_DEVICE_GET_PCI_REVISION, 230 &device)) 231 continue; 232 233 if (device->bustype == DRM_BUS_PCI && 234 amdgpu_ras_lookup_capability(device_handle)) { 235 amdgpu_device_deinitialize(device_handle); 236 return CU_TRUE; 237 } 238 239 if (amdgpu_device_deinitialize(device_handle)) 240 continue; 241 } 242 243 return CU_FALSE; 244} 245 246int suite_ras_tests_init(void) 247{ 248 drmDevicePtr device; 249 amdgpu_device_handle device_handle; 250 uint32_t major_version; 251 uint32_t minor_version; 252 uint32_t capability; 253 struct ras_test_mask test_mask; 254 int id; 255 int i; 256 int r; 257 258 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { 259 r = amdgpu_device_initialize(drm_amdgpu[i], &major_version, 260 &minor_version, &device_handle); 261 if (r) 262 continue; 263 264 if (drmGetDevice2(drm_amdgpu[i], 265 DRM_DEVICE_GET_PCI_REVISION, 266 &device)) { 267 amdgpu_device_deinitialize(device_handle); 268 continue; 269 } 270 271 if (device->bustype != DRM_BUS_PCI) { 272 amdgpu_device_deinitialize(device_handle); 273 continue; 274 } 275 276 capability = amdgpu_ras_lookup_capability(device_handle); 277 if (capability == 0) { 278 amdgpu_device_deinitialize(device_handle); 279 continue; 280 281 } 282 283 id = amdgpu_ras_lookup_id(device); 284 if (id == -1) { 285 amdgpu_device_deinitialize(device_handle); 286 continue; 287 } 288 289 test_mask = amdgpu_ras_get_test_mask(device); 290 291 devices[devices_count++] = (struct 
amdgpu_ras_data) { 292 device_handle, id, capability, test_mask, 293 }; 294 } 295 296 if (devices_count == 0) 297 return CUE_SINIT_FAILED; 298 299 return CUE_SUCCESS; 300} 301 302int suite_ras_tests_clean(void) 303{ 304 int r; 305 int i; 306 int ret = CUE_SUCCESS; 307 308 for (i = 0; i < devices_count; i++) { 309 r = amdgpu_device_deinitialize(devices[i].device_handle); 310 if (r) 311 ret = CUE_SCLEAN_FAILED; 312 } 313 return ret; 314} 315 316static void amdgpu_ras_disable_test(void); 317static void amdgpu_ras_enable_test(void); 318static void amdgpu_ras_inject_test(void); 319static void amdgpu_ras_query_test(void); 320static void amdgpu_ras_basic_test(void); 321 322CU_TestInfo ras_tests[] = { 323 { "ras basic test", amdgpu_ras_basic_test }, 324 { "ras query test", amdgpu_ras_query_test }, 325 { "ras inject test", amdgpu_ras_inject_test }, 326 { "ras disable test", amdgpu_ras_disable_test }, 327#if 0 328 { "ras enable test", amdgpu_ras_enable_test }, 329#endif 330 CU_TEST_INFO_NULL, 331}; 332 333//helpers 334 335static int test_card; 336static char sysfs_path[1024]; 337static char debugfs_path[1024]; 338static uint32_t ras_mask; 339static amdgpu_device_handle device_handle; 340 341static int set_test_card(int card) 342{ 343 int i; 344 345 test_card = card; 346 sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id); 347 sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id); 348 ras_mask = devices[card].capability; 349 device_handle = devices[card].device_handle; 350 ras_block_mask_inject = devices[card].test_mask.inject_mask; 351 ras_block_mask_query = devices[card].test_mask.query_mask; 352 ras_block_mask_basic = devices[card].test_mask.basic_mask; 353 354 return 0; 355} 356 357static const char *get_ras_sysfs_root(void) 358{ 359 return sysfs_path; 360} 361 362static const char *get_ras_debugfs_root(void) 363{ 364 return debugfs_path; 365} 366 367static int set_file_contents(char *file, char *buf, int size) 368{ 369 int n, fd; 
370 fd = open(file, O_WRONLY); 371 if (fd == -1) 372 return -1; 373 n = write(fd, buf, size); 374 close(fd); 375 return n; 376} 377 378static int get_file_contents(char *file, char *buf, int size) 379{ 380 int n, fd; 381 fd = open(file, O_RDONLY); 382 if (fd == -1) 383 return -1; 384 n = read(fd, buf, size); 385 close(fd); 386 return n; 387} 388 389static int is_file_ok(char *file, int flags) 390{ 391 int fd; 392 393 fd = open(file, flags); 394 if (fd == -1) 395 return -1; 396 close(fd); 397 return 0; 398} 399 400static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block) 401{ 402 uint32_t feature_mask; 403 int ret; 404 405 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, 406 sizeof(feature_mask), &feature_mask); 407 if (ret) 408 return -1; 409 410 return (1 << block) & feature_mask; 411} 412 413static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block) 414{ 415 return (1 << block) & ras_mask; 416} 417 418static int amdgpu_ras_invoke(struct ras_debug_if *data) 419{ 420 char path[1024]; 421 int ret; 422 423 sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl"); 424 425 ret = set_file_contents(path, (char *)data, sizeof(*data)) 426 - sizeof(*data); 427 return ret; 428} 429 430static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block, 431 unsigned long *ue, unsigned long *ce) 432{ 433 char buf[64]; 434 char name[1024]; 435 int ret; 436 437 *ue = *ce = 0; 438 439 if (amdgpu_ras_is_feature_supported(block) <= 0) 440 return -1; 441 442 sprintf(name, "%s%s%s", get_ras_sysfs_root(), ras_block_str(block), "_err_count"); 443 444 if (is_file_ok(name, O_RDONLY)) 445 return 0; 446 447 if (get_file_contents(name, buf, sizeof(buf)) <= 0) 448 return -1; 449 450 if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2) 451 return -1; 452 453 return 0; 454} 455 456//tests 457static void amdgpu_ras_features_test(int enable) 458{ 459 struct ras_debug_if data; 460 int ret; 461 int i; 462 463 data.op = enable; 464 for (i = 0; i < 
AMDGPU_RAS_BLOCK__LAST; i++) { 465 struct ras_common_if head = { 466 .block = i, 467 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, 468 .sub_block_index = 0, 469 .name = "", 470 }; 471 472 if (amdgpu_ras_is_feature_supported(i) <= 0) 473 continue; 474 475 data.head = head; 476 477 ret = amdgpu_ras_invoke(&data); 478 CU_ASSERT_EQUAL(ret, 0); 479 480 if (ret) 481 continue; 482 483 ret = enable ^ amdgpu_ras_is_feature_enabled(i); 484 CU_ASSERT_EQUAL(ret, 0); 485 } 486} 487 488static void amdgpu_ras_disable_test(void) 489{ 490 int i; 491 for (i = 0; i < devices_count; i++) { 492 set_test_card(i); 493 amdgpu_ras_features_test(0); 494 } 495} 496 497static void amdgpu_ras_enable_test(void) 498{ 499 int i; 500 for (i = 0; i < devices_count; i++) { 501 set_test_card(i); 502 amdgpu_ras_features_test(1); 503 } 504} 505 506static void __amdgpu_ras_inject_test(void) 507{ 508 struct ras_debug_if data; 509 int ret; 510 int i; 511 unsigned long ue, ce, ue_old, ce_old; 512 513 data.op = 2; 514 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { 515 int timeout = 3; 516 struct ras_inject_if inject = { 517 .head = { 518 .block = i, 519 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, 520 .sub_block_index = 0, 521 .name = "", 522 }, 523 .address = 0, 524 .value = 0, 525 }; 526 527 if (amdgpu_ras_is_feature_enabled(i) <= 0) 528 continue; 529 530 if (!((1 << i) & ras_block_mask_inject)) 531 continue; 532 533 data.inject = inject; 534 535 ret = amdgpu_ras_query_err_count(i, &ue_old, &ce_old); 536 CU_ASSERT_EQUAL(ret, 0); 537 538 if (ret) 539 continue; 540 541 ret = amdgpu_ras_invoke(&data); 542 CU_ASSERT_EQUAL(ret, 0); 543 544 if (ret) 545 continue; 546 547loop: 548 while (timeout > 0) { 549 ret = amdgpu_ras_query_err_count(i, &ue, &ce); 550 CU_ASSERT_EQUAL(ret, 0); 551 552 if (ret) 553 continue; 554 if (ue_old != ue) { 555 /*recovery takes ~10s*/ 556 sleep(10); 557 break; 558 } 559 560 sleep(1); 561 timeout -= 1; 562 } 563 564 CU_ASSERT_EQUAL(ue_old + 1, ue); 565 CU_ASSERT_EQUAL(ce_old, ce); 
566 } 567} 568 569static void amdgpu_ras_inject_test(void) 570{ 571 int i; 572 for (i = 0; i < devices_count; i++) { 573 set_test_card(i); 574 __amdgpu_ras_inject_test(); 575 } 576} 577 578static void __amdgpu_ras_query_test(void) 579{ 580 unsigned long ue, ce; 581 int ret; 582 int i; 583 584 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { 585 if (amdgpu_ras_is_feature_supported(i) <= 0) 586 continue; 587 588 if (!((1 << i) & ras_block_mask_query)) 589 continue; 590 591 ret = amdgpu_ras_query_err_count(i, &ue, &ce); 592 CU_ASSERT_EQUAL(ret, 0); 593 } 594} 595 596static void amdgpu_ras_query_test(void) 597{ 598 int i; 599 for (i = 0; i < devices_count; i++) { 600 set_test_card(i); 601 __amdgpu_ras_query_test(); 602 } 603} 604 605static void amdgpu_ras_basic_test(void) 606{ 607 unsigned long ue, ce; 608 char name[1024]; 609 int ret; 610 int i; 611 int j; 612 uint32_t features; 613 char path[1024]; 614 615 ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY); 616 CU_ASSERT_EQUAL(ret, 0); 617 618 for (i = 0; i < devices_count; i++) { 619 set_test_card(i); 620 621 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, 622 sizeof(features), &features); 623 CU_ASSERT_EQUAL(ret, 0); 624 625 sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl"); 626 ret = is_file_ok(path, O_WRONLY); 627 CU_ASSERT_EQUAL(ret, 0); 628 629 sprintf(path, "%s%s", get_ras_sysfs_root(), "features"); 630 ret = is_file_ok(path, O_RDONLY); 631 CU_ASSERT_EQUAL(ret, 0); 632 633 for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) { 634 ret = amdgpu_ras_is_feature_supported(j); 635 if (ret <= 0) 636 continue; 637 638 if (!((1 << j) & ras_block_mask_basic)) 639 continue; 640 641 sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count"); 642 ret = is_file_ok(path, O_RDONLY); 643 CU_ASSERT_EQUAL(ret, 0); 644 645 sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject"); 646 ret = is_file_ok(path, O_WRONLY); 647 CU_ASSERT_EQUAL(ret, 0); 
648 } 649 } 650} 651