/* ras_tests.c, revision 5324fb0d */
/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "CUnit/Basic.h"

#include "amdgpu_test.h"
#include "amdgpu_drm.h"
#include "amdgpu_internal.h"
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include "xf86drm.h"

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_block_str(i) (ras_block_string[i])

enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC = 0,
	AMDGPU_RAS_BLOCK__SDMA,
	AMDGPU_RAS_BLOCK__GFX,
	AMDGPU_RAS_BLOCK__MMHUB,
	AMDGPU_RAS_BLOCK__ATHUB,
	AMDGPU_RAS_BLOCK__PCIE_BIF,
	AMDGPU_RAS_BLOCK__HDP,
	AMDGPU_RAS_BLOCK__XGMI_WAFL,
	AMDGPU_RAS_BLOCK__DF,
	AMDGPU_RAS_BLOCK__SMN,
	AMDGPU_RAS_BLOCK__SEM,
	AMDGPU_RAS_BLOCK__MP0,
	AMDGPU_RAS_BLOCK__MP1,
	AMDGPU_RAS_BLOCK__FUSE,

	AMDGPU_RAS_BLOCK__LAST
};

#define AMDGPU_RAS_BLOCK_COUNT  AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK   ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)

enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE				= 0,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE		= 2,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE		= 4,
	AMDGPU_RAS_ERROR__POISON			= 8,
};

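/*
 * Local mirror of the RAS control structures that the tests pass to the
 * kernel's ras_ctrl debugfs node (see amdgpu_ras_invoke() below).
 */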
struct ras_common_if {
	enum amdgpu_ras_block block;
	enum amdgpu_ras_error_type type;
	uint32_t sub_block_index;
	char name[32];
};

struct ras_inject_if {
	struct ras_common_if head;
	uint64_t address;
	uint64_t value;
};

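/*
 * op selects the request type as used by the tests below:
 * 0 = disable the block, 1 = enable it, 2 = inject an error.
 */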
struct ras_debug_if {
	union {
		struct ras_common_if head;
		struct ras_inject_if inject;
	};
	int op;
};

/* For now, only umc, gfx and sdma are implemented. */
#define DEFAULT_RAS_BLOCK_MASK_INJECT (1 << AMDGPU_RAS_BLOCK__UMC)
#define DEFAULT_RAS_BLOCK_MASK_QUERY (1 << AMDGPU_RAS_BLOCK__UMC)
#define DEFAULT_RAS_BLOCK_MASK_BASIC ((1 << AMDGPU_RAS_BLOCK__UMC) |\
		(1 << AMDGPU_RAS_BLOCK__SDMA) |\
		(1 << AMDGPU_RAS_BLOCK__GFX))

static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT;
static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_QUERY;
static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC;

struct ras_test_mask {
	uint32_t inject_mask;
	uint32_t query_mask;
	uint32_t basic_mask;
};

struct amdgpu_ras_data {
	amdgpu_device_handle device_handle;
	uint32_t  id;
	uint32_t  capability;
	struct ras_test_mask test_mask;
};

/* All devices that have RAS support. */
static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED];
static int devices_count;

struct ras_DID_test_mask {
	uint16_t device_id;
	uint16_t revision_id;
	struct ras_test_mask test_mask;
};

/* White list of devices for the inject test. */
#define RAS_BLOCK_MASK_ALL {\
	DEFAULT_RAS_BLOCK_MASK_INJECT,\
	DEFAULT_RAS_BLOCK_MASK_QUERY,\
	DEFAULT_RAS_BLOCK_MASK_BASIC\
}

#define RAS_BLOCK_MASK_QUERY_BASIC {\
	0,\
	DEFAULT_RAS_BLOCK_MASK_QUERY,\
	DEFAULT_RAS_BLOCK_MASK_BASIC\
}

static const struct ras_DID_test_mask ras_DID_array[] = {
	{0x66a1, 0x00, RAS_BLOCK_MASK_ALL},
	{0x66a1, 0x01, RAS_BLOCK_MASK_ALL},
	{0x66a1, 0x04, RAS_BLOCK_MASK_ALL},
};

static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device)
{
	int i;
	static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC;

	for (i = 0; i < sizeof(ras_DID_array) / sizeof(ras_DID_array[0]); i++) {
		if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id &&
				ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id)
			return ras_DID_array[i].test_mask;
	}
	return default_test_mask;
}

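/*
 * AMDGPU_INFO_RAS_ENABLED_FEATURES is read as a 64-bit mask; the union below
 * treats the low 32 bits as the enabled features and the high 32 bits as the
 * supported features. The suite keys off the supported mask.
 */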
static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle)
{
	union {
		uint64_t feature_mask;
		struct {
			uint32_t enabled_features;
			uint32_t supported_features;
		};
	} features = { 0 };
	int ret;

	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
			sizeof(features), &features);
	if (ret)
		return 0;

	return features.supported_features;
}

static int get_file_contents(char *file, char *buf, int size);

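/*
 * Map a DRM device to its debugfs/sysfs index by matching its PCI bus info
 * against the "amdgpu dev=..." line in /sys/kernel/debug/dri/<n>/name.
 */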
static int amdgpu_ras_lookup_id(drmDevicePtr device)
{
	char path[1024];
	char str[128];
	drmPciBusInfo info;
	int i;
	int ret;

	for (i = 0; i < MAX_CARDS_SUPPORTED; i++) {
		memset(str, 0, sizeof(str));
		memset(&info, 0, sizeof(info));
		sprintf(path, "/sys/kernel/debug/dri/%d/name", i);
		if (get_file_contents(path, str, sizeof(str)) <= 0)
			continue;

		ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx",
				&info.domain, &info.bus, &info.dev, &info.func);
		if (ret != 4)
			continue;

		if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0)
			return i;
	}
	return -1;
}

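/*
 * Enable the suite only when at least one PCI amdgpu device reports a
 * non-zero RAS capability mask.
 */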
CU_BOOL suite_ras_tests_enable(void)
{
	amdgpu_device_handle device_handle;
	uint32_t  major_version;
	uint32_t  minor_version;
	int i;
	drmDevicePtr device;

	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
		if (amdgpu_device_initialize(drm_amdgpu[i], &major_version,
					&minor_version, &device_handle))
			continue;

		if (drmGetDevice2(drm_amdgpu[i],
					DRM_DEVICE_GET_PCI_REVISION,
					&device)) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		if (device->bustype == DRM_BUS_PCI &&
				amdgpu_ras_lookup_capability(device_handle)) {
			amdgpu_device_deinitialize(device_handle);
			return CU_TRUE;
		}

		if (amdgpu_device_deinitialize(device_handle))
			continue;
	}

	return CU_FALSE;
}

int suite_ras_tests_init(void)
{
	drmDevicePtr device;
	amdgpu_device_handle device_handle;
	uint32_t  major_version;
	uint32_t  minor_version;
	uint32_t  capability;
	struct ras_test_mask test_mask;
	int id;
	int i;
	int r;

	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
		r = amdgpu_device_initialize(drm_amdgpu[i], &major_version,
				&minor_version, &device_handle);
		if (r)
			continue;

		if (drmGetDevice2(drm_amdgpu[i],
					DRM_DEVICE_GET_PCI_REVISION,
					&device)) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		if (device->bustype != DRM_BUS_PCI) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		capability = amdgpu_ras_lookup_capability(device_handle);
		if (capability == 0) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		id = amdgpu_ras_lookup_id(device);
		if (id == -1) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		test_mask = amdgpu_ras_get_test_mask(device);

		devices[devices_count++] = (struct amdgpu_ras_data) {
			device_handle, id, capability, test_mask,
		};
	}

	if (devices_count == 0)
		return CUE_SINIT_FAILED;

	return CUE_SUCCESS;
}

int suite_ras_tests_clean(void)
{
	int r;
	int i;
	int ret = CUE_SUCCESS;

	for (i = 0; i < devices_count; i++) {
		r = amdgpu_device_deinitialize(devices[i].device_handle);
		if (r)
			ret = CUE_SCLEAN_FAILED;
	}
	return ret;
}

static void amdgpu_ras_disable_test(void);
static void amdgpu_ras_enable_test(void);
static void amdgpu_ras_inject_test(void);
static void amdgpu_ras_query_test(void);
static void amdgpu_ras_basic_test(void);

CU_TestInfo ras_tests[] = {
	{ "ras basic test",	amdgpu_ras_basic_test },
	{ "ras query test",	amdgpu_ras_query_test },
	{ "ras inject test",	amdgpu_ras_inject_test },
	{ "ras disable test",	amdgpu_ras_disable_test },
#if 0
	{ "ras enable test",	amdgpu_ras_enable_test },
#endif
	CU_TEST_INFO_NULL,
};

// helpers

static int test_card;
static char sysfs_path[1024];
static char debugfs_path[1024];
static uint32_t ras_mask;
static amdgpu_device_handle device_handle;

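/*
 * Point the per-test globals (sysfs/debugfs paths, capability mask, device
 * handle and block masks) at the given entry of devices[].
 */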
static int set_test_card(int card)
{
	test_card = card;
	sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id);
	sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id);
	ras_mask = devices[card].capability;
	device_handle = devices[card].device_handle;
	ras_block_mask_inject = devices[card].test_mask.inject_mask;
	ras_block_mask_query = devices[card].test_mask.query_mask;
	ras_block_mask_basic = devices[card].test_mask.basic_mask;

	return 0;
}

static const char *get_ras_sysfs_root(void)
{
	return sysfs_path;
}

static const char *get_ras_debugfs_root(void)
{
	return debugfs_path;
}

static int set_file_contents(char *file, char *buf, int size)
{
	int n, fd;
	fd = open(file, O_WRONLY);
	if (fd == -1)
		return -1;
	n = write(fd, buf, size);
	close(fd);
	return n;
}

static int get_file_contents(char *file, char *buf, int size)
{
	int n, fd;
	fd = open(file, O_RDONLY);
	if (fd == -1)
		return -1;
	n = read(fd, buf, size);
	close(fd);
	return n;
}

static int is_file_ok(char *file, int flags)
{
	int fd;

	fd = open(file, flags);
	if (fd == -1)
		return -1;
	close(fd);
	return 0;
}

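/*
 * Returns the block's bit from the enabled-feature mask: non-zero if the
 * feature is enabled, 0 if it is not, and -1 if the query itself fails.
 */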
static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)
{
	uint32_t feature_mask;
	int ret;

	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
			sizeof(feature_mask), &feature_mask);
	if (ret)
		return -1;

	return (1 << block) & feature_mask;
}

static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)
{
	return (1 << block) & ras_mask;
}

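/*
 * Submit a request to the ras_ctrl debugfs node. Returns 0 only when the
 * whole ras_debug_if struct was written.
 */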
static int amdgpu_ras_invoke(struct ras_debug_if *data)
{
	char path[1024];
	int ret;

	sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");

	ret = set_file_contents(path, (char *)data, sizeof(*data))
		- sizeof(*data);
	return ret;
}

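/*
 * Read the "ue:"/"ce:" counters for a block from its <block>_err_count sysfs
 * node.
 */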
static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block,
		unsigned long *ue, unsigned long *ce)
{
	char buf[64];
	char name[1024];

	*ue = *ce = 0;

	if (amdgpu_ras_is_feature_supported(block) <= 0)
		return -1;

	sprintf(name, "%s%s%s", get_ras_sysfs_root(), ras_block_str(block), "_err_count");

	/* The err_count node is missing for this block; report zero errors. */
	if (is_file_ok(name, O_RDONLY))
		return 0;

	if (get_file_contents(name, buf, sizeof(buf)) <= 0)
		return -1;

	if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2)
		return -1;

	return 0;
}

// tests
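/*
 * Enable (op 1) or disable (op 0) every supported block through ras_ctrl and
 * verify that the enabled state reported by the kernel matches.
 */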
static void amdgpu_ras_features_test(int enable)
{
	struct ras_debug_if data;
	int ret;
	int i;

	data.op = enable;
	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
			.sub_block_index = 0,
			.name = "",
		};

		if (amdgpu_ras_is_feature_supported(i) <= 0)
			continue;

		data.head = head;

		ret = amdgpu_ras_invoke(&data);
		CU_ASSERT_EQUAL(ret, 0);

		if (ret)
			continue;

		/* Normalize to 0/1 before comparing against the requested state. */
		ret = enable ^ (amdgpu_ras_is_feature_enabled(i) > 0);
		CU_ASSERT_EQUAL(ret, 0);
	}
}

static void amdgpu_ras_disable_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		amdgpu_ras_features_test(0);
	}
}

static void amdgpu_ras_enable_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		amdgpu_ras_features_test(1);
	}
}

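/*
 * For every enabled block on the inject white list: snapshot the error
 * counters, submit an inject request (op 2) through ras_ctrl, then poll the
 * counters until the UE count moves, allowing ~10s for recovery to finish.
 */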
static void __amdgpu_ras_inject_test(void)
{
	struct ras_debug_if data;
	int ret;
	int i;
	unsigned long ue, ce, ue_old, ce_old;

	data.op = 2;
	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
		int timeout = 3;
		struct ras_inject_if inject = {
			.head = {
				.block = i,
				.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
				.sub_block_index = 0,
				.name = "",
			},
			.address = 0,
			.value = 0,
		};

		if (amdgpu_ras_is_feature_enabled(i) <= 0)
			continue;

		if (!((1 << i) & ras_block_mask_inject))
			continue;

		data.inject = inject;

		ret = amdgpu_ras_query_err_count(i, &ue_old, &ce_old);
		CU_ASSERT_EQUAL(ret, 0);

		if (ret)
			continue;

		ret = amdgpu_ras_invoke(&data);
		CU_ASSERT_EQUAL(ret, 0);

		if (ret)
			continue;

		while (timeout > 0) {
			ret = amdgpu_ras_query_err_count(i, &ue, &ce);
			CU_ASSERT_EQUAL(ret, 0);

			if (!ret && ue_old != ue) {
				/* Recovery takes ~10s. */
				sleep(10);
				break;
			}

			sleep(1);
			timeout -= 1;
		}

		CU_ASSERT_EQUAL(ue_old + 1, ue);
		CU_ASSERT_EQUAL(ce_old, ce);
	}
}

static void amdgpu_ras_inject_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		__amdgpu_ras_inject_test();
	}
}

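/* Query the error counters of every supported block on the query mask. */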
static void __amdgpu_ras_query_test(void)
{
	unsigned long ue, ce;
	int ret;
	int i;

	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
		if (amdgpu_ras_is_feature_supported(i) <= 0)
			continue;

		if (!((1 << i) & ras_block_mask_query))
			continue;

		ret = amdgpu_ras_query_err_count(i, &ue, &ce);
		CU_ASSERT_EQUAL(ret, 0);
	}
}

static void amdgpu_ras_query_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		__amdgpu_ras_query_test();
	}
}

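/*
 * Check that the expected RAS nodes exist: the ras_mask module parameter, the
 * per-device ras_ctrl and features nodes, and the per-block err_count and
 * err_inject nodes.
 */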
static void amdgpu_ras_basic_test(void)
{
	int ret;
	int i;
	int j;
	uint32_t features;
	char path[1024];

	ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY);
	CU_ASSERT_EQUAL(ret, 0);

	for (i = 0; i < devices_count; i++) {
		set_test_card(i);

		ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
				sizeof(features), &features);
		CU_ASSERT_EQUAL(ret, 0);

		sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
		ret = is_file_ok(path, O_WRONLY);
		CU_ASSERT_EQUAL(ret, 0);

		sprintf(path, "%s%s", get_ras_sysfs_root(), "features");
		ret = is_file_ok(path, O_RDONLY);
		CU_ASSERT_EQUAL(ret, 0);

		for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) {
			ret = amdgpu_ras_is_feature_supported(j);
			if (ret <= 0)
				continue;

			if (!((1 << j) & ras_block_mask_basic))
				continue;

			sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count");
			ret = is_file_ok(path, O_RDONLY);
			CU_ASSERT_EQUAL(ret, 0);

			sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject");
			ret = is_file_ok(path, O_WRONLY);
			CU_ASSERT_EQUAL(ret, 0);
		}
	}
}