ras_tests.c revision 88f8a8d2
1/*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22*/
23
24#include "CUnit/Basic.h"
25
26#include "amdgpu_test.h"
27#include "amdgpu_drm.h"
28#include "amdgpu_internal.h"
29#include <unistd.h>
30#include <fcntl.h>
31#include <stdio.h>
32#include "xf86drm.h"
33
34#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
35
/*
 * Printable names of the RAS IP blocks.  The table is indexed with
 * enum amdgpu_ras_block values, so its order has to follow that enum.
 */
const char *ras_block_string[] = {
	"umc", "sdma", "gfx", "mmhub", "athub", "pcie_bif", "hdp",
	"xgmi_wafl", "df", "smn", "sem", "mp0", "mp1", "fuse",
};

/* Name of RAS block number i; the index is not range checked. */
#define ras_block_str(i) (ras_block_string[i])
54
/*
 * RAS-capable IP blocks.  Each value is used in this file both as an index
 * into ras_block_string[] and as a bit position in the block masks.
 */
enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC		= 0,
	AMDGPU_RAS_BLOCK__SDMA		= 1,
	AMDGPU_RAS_BLOCK__GFX		= 2,
	AMDGPU_RAS_BLOCK__MMHUB		= 3,
	AMDGPU_RAS_BLOCK__ATHUB		= 4,
	AMDGPU_RAS_BLOCK__PCIE_BIF	= 5,
	AMDGPU_RAS_BLOCK__HDP		= 6,
	AMDGPU_RAS_BLOCK__XGMI_WAFL	= 7,
	AMDGPU_RAS_BLOCK__DF		= 8,
	AMDGPU_RAS_BLOCK__SMN		= 9,
	AMDGPU_RAS_BLOCK__SEM		= 10,
	AMDGPU_RAS_BLOCK__MP0		= 11,
	AMDGPU_RAS_BLOCK__MP1		= 12,
	AMDGPU_RAS_BLOCK__FUSE		= 13,

	/* one past the last real block */
	AMDGPU_RAS_BLOCK__LAST
};

/* Number of blocks, and a mask with one bit set per block. */
#define AMDGPU_RAS_BLOCK_COUNT  AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK   ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
76
/*
 * Sub-block indices inside the GFX RAS block; the GFX injection table uses
 * them as its sub_block values.
 *
 * Members are numbered consecutively starting at 0.  The *_INDEX_START and
 * *_INDEX_END markers share their value with the first and the last member
 * of the group they bracket.
 */
enum amdgpu_ras_gfx_subblock {
	/* CPC */
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START = 0,
	AMDGPU_RAS_BLOCK__GFX_CPC_SCRATCH = AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_END = AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,

	/* CPF */
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME2 = AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME1,
	AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPF_TAG,

	/* CPG */
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_ROQ = AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPG_TAG,

	/* GDS */
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_MEM = AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_INPUT_QUEUE,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_CMD_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_DATA_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_END = AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,

	/* SPI (single member, no range markers) */
	AMDGPU_RAS_BLOCK__GFX_SPI_SR_MEM,

	/* SQ */
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_SGPR = AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_I,
	AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_END = AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,

	/* SQC (3 ranges) */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	/* SQC range 0 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START = AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_UTCL1_LFIFO = AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_END = AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	/* SQC range 1 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM = AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_END = AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	/* SQC range 2 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM = AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END = AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_END = AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END,

	/* TA */
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO = AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_AFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FL_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FX_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,

	/* TCA */
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_HOLE_FIFO = AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,

	/* TCC (5 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	/* TCC range 0 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START = AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA = AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_HIGH_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_END = AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	/* TCC range 1 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_DEC = AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_END = AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	/* TCC range 2 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_DATA = AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_CONTROL,
	AMDGPU_RAS_BLOCK__GFX_TCC_UC_ATOMIC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_RETURN,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_CACHE_READ,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_END = AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	/* TCC range 3 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO = AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_END = AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	/* TCC range 4 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRRET_TAG_WRITE_RETURN = AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END = AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END,

	/* TCI (single member, no range markers) */
	AMDGPU_RAS_BLOCK__GFX_TCI_WRITE_RAM,

	/* TCP */
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM = AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_LFIFO_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_CMD_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_VM_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_DB_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO0,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,

	/* TD */
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO = AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_HI,
	AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,

	/* EA (3 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	/* EA range 0 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START = AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM = AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_RRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_WRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_END = AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	/* EA range 1 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_PAGEMEM = AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IORD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_END = AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	/* EA range 2 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D0MEM = AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D1MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D2MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END = AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END,

	/* UTC VM L2 bank */
	AMDGPU_RAS_BLOCK__UTC_VML2_BANK_CACHE,
	/* UTC VM walker */
	AMDGPU_RAS_BLOCK__UTC_VML2_WALKER,
	/* UTC ATC L2 2MB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_2M_BANK,
	/* UTC ATC L2 4KB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_4K_BANK,

	/* one past the last sub-block */
	AMDGPU_RAS_BLOCK__GFX_MAX
};
306
/* RAS error kinds; apart from NONE every value is a distinct single bit. */
enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE			= 0,
	AMDGPU_RAS_ERROR__PARITY		= 1 << 0,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE	= 1 << 1,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE	= 1 << 2,
	AMDGPU_RAS_ERROR__POISON		= 1 << 3,
};
314
315struct ras_inject_test_config {
316	char name[64];
317	char block[32];
318	int sub_block;
319	enum amdgpu_ras_error_type type;
320	uint64_t address;
321	uint64_t value;
322};
323
324struct ras_common_if {
325	enum amdgpu_ras_block block;
326	enum amdgpu_ras_error_type type;
327	uint32_t sub_block_index;
328	char name[32];
329};
330
331struct ras_inject_if {
332	struct ras_common_if head;
333	uint64_t address;
334	uint64_t value;
335};
336
337struct ras_debug_if {
338	union {
339		struct ras_common_if head;
340		struct ras_inject_if inject;
341	};
342	int op;
343};
/*
 * Default block masks, one bit per enum amdgpu_ras_block value.
 * For now only the umc, sdma and gfx blocks are implemented; sdma is
 * covered by the basic test only.
 */
#define DEFAULT_RAS_BLOCK_MASK_INJECT ((1 << AMDGPU_RAS_BLOCK__UMC) |\
		(1 << AMDGPU_RAS_BLOCK__GFX))
#define DEFAULT_RAS_BLOCK_MASK_QUERY ((1 << AMDGPU_RAS_BLOCK__UMC) |\
		(1 << AMDGPU_RAS_BLOCK__GFX))
#define DEFAULT_RAS_BLOCK_MASK_BASIC ((1 << AMDGPU_RAS_BLOCK__UMC) |\
		(1 << AMDGPU_RAS_BLOCK__SDMA) |\
		(1 << AMDGPU_RAS_BLOCK__GFX))
352
353static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT;
354static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_INJECT;
355static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC;
356
/* Per-device block masks, one for each of the inject, query and basic tests. */
struct ras_test_mask {
	uint32_t inject_mask;	/* copied into ras_block_mask_inject */
	uint32_t query_mask;	/* copied into ras_block_mask_query */
	uint32_t basic_mask;	/* copied into ras_block_mask_basic */
};
362
363struct amdgpu_ras_data {
364	amdgpu_device_handle device_handle;
365	uint32_t  id;
366	uint32_t  capability;
367	struct ras_test_mask test_mask;
368};
369
370/* all devices who has ras supported */
371static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED];
372static int devices_count;
373
374struct ras_DID_test_mask{
375	uint16_t device_id;
376	uint16_t revision_id;
377	struct ras_test_mask test_mask;
378};
379
/*
 * Initializers for struct ras_test_mask.
 *
 * RAS_BLOCK_MASK_ALL: white-listed device, every test runs with its default
 * block mask.
 */
#define RAS_BLOCK_MASK_ALL {\
	.inject_mask = DEFAULT_RAS_BLOCK_MASK_INJECT,\
	.query_mask = DEFAULT_RAS_BLOCK_MASK_QUERY,\
	.basic_mask = DEFAULT_RAS_BLOCK_MASK_BASIC\
}

/* RAS_BLOCK_MASK_QUERY_BASIC: empty inject mask, default query and basic masks. */
#define RAS_BLOCK_MASK_QUERY_BASIC {\
	.inject_mask = 0,\
	.query_mask = DEFAULT_RAS_BLOCK_MASK_QUERY,\
	.basic_mask = DEFAULT_RAS_BLOCK_MASK_BASIC\
}
392
393static const struct ras_inject_test_config umc_ras_inject_test[] = {
394	{"ras_umc.1.0", "umc", 0, AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
395};
396
397static const struct ras_inject_test_config gfx_ras_inject_test[] = {
398	{"ras_gfx.2.0", "gfx", AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
399		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
400	{"ras_gfx.2.1", "gfx", AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
401		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
402	{"ras_gfx.2.2", "gfx", AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
403		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
404	{"ras_gfx.2.3", "gfx", AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
405		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
406	{"ras_gfx.2.4", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
407		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
408	{"ras_gfx.2.5", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM,
409		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
410	{"ras_gfx.2.6", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM,
411		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
412	{"ras_gfx.2.7", "gfx", AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO,
413		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
414	{"ras_gfx.2.8", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA,
415		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
416	{"ras_gfx.2.9", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
417		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
418	{"ras_gfx.2.10", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
419		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
420	{"ras_gfx.2.11", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
421		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
422	{"ras_gfx.2.12", "gfx", AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM,
423		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
424	{"ras_gfx.2.13", "gfx", AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO,
425		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
426	{"ras_gfx.2.14", "gfx", AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM,
427		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
428};
429
430static const struct ras_DID_test_mask ras_DID_array[] = {
431	{0x66a1, 0x00, RAS_BLOCK_MASK_ALL},
432	{0x66a1, 0x01, RAS_BLOCK_MASK_ALL},
433	{0x66a1, 0x04, RAS_BLOCK_MASK_ALL},
434};
435
436static uint32_t amdgpu_ras_find_block_id_by_name(const char *name)
437{
438	int i;
439
440	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
441		if (strcmp(name, ras_block_string[i]) == 0)
442			return i;
443	}
444
445	return ARRAY_SIZE(ras_block_string);
446}
447
448static char *amdgpu_ras_get_error_type_id(enum amdgpu_ras_error_type type)
449{
450	switch (type) {
451	case AMDGPU_RAS_ERROR__PARITY:
452		return "parity";
453	case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
454		return "single_correctable";
455	case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
456		return "multi_uncorrectable";
457	case AMDGPU_RAS_ERROR__POISON:
458		return "poison";
459	case AMDGPU_RAS_ERROR__NONE:
460	default:
461		return NULL;
462	}
463}
464
465static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device)
466{
467	int i;
468	static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC;
469
470	for (i = 0; i < sizeof(ras_DID_array) / sizeof(ras_DID_array[0]); i++) {
471		if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id &&
472				ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id)
473			return ras_DID_array[i].test_mask;
474	}
475	return default_test_mask;
476}
477
478static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle)
479{
480	union {
481		uint64_t feature_mask;
482		struct {
483			uint32_t enabled_features;
484			uint32_t supported_features;
485		};
486	} features = { 0 };
487	int ret;
488
489	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
490			sizeof(features), &features);
491	if (ret)
492		return 0;
493
494	return features.supported_features;
495}
496
497static int get_file_contents(char *file, char *buf, int size);
498
499static int amdgpu_ras_lookup_id(drmDevicePtr device)
500{
501	char path[1024];
502	char str[128];
503	drmPciBusInfo info;
504	int i;
505	int ret;
506
507	for (i = 0; i < MAX_CARDS_SUPPORTED; i++) {
508		memset(str, 0, sizeof(str));
509		memset(&info, 0, sizeof(info));
510		sprintf(path, "/sys/kernel/debug/dri/%d/name", i);
511		if (get_file_contents(path, str, sizeof(str)) <= 0)
512			continue;
513
514		ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx",
515				&info.domain, &info.bus, &info.dev, &info.func);
516		if (ret != 4)
517			continue;
518
519		if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0)
520				return i;
521	}
522	return -1;
523}
524
525CU_BOOL suite_ras_tests_enable(void)
526{
527	amdgpu_device_handle device_handle;
528	uint32_t  major_version;
529	uint32_t  minor_version;
530	int i;
531	drmDevicePtr device;
532
533	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
534		if (amdgpu_device_initialize(drm_amdgpu[i], &major_version,
535					&minor_version, &device_handle))
536			continue;
537
538		if (drmGetDevice2(drm_amdgpu[i],
539					DRM_DEVICE_GET_PCI_REVISION,
540					&device))
541			continue;
542
543		if (device->bustype == DRM_BUS_PCI &&
544				amdgpu_ras_lookup_capability(device_handle)) {
545			amdgpu_device_deinitialize(device_handle);
546			return CU_TRUE;
547		}
548
549		if (amdgpu_device_deinitialize(device_handle))
550			continue;
551	}
552
553	return CU_FALSE;
554}
555
556int suite_ras_tests_init(void)
557{
558	drmDevicePtr device;
559	amdgpu_device_handle device_handle;
560	uint32_t  major_version;
561	uint32_t  minor_version;
562	uint32_t  capability;
563	struct ras_test_mask test_mask;
564	int id;
565	int i;
566	int r;
567
568	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
569		r = amdgpu_device_initialize(drm_amdgpu[i], &major_version,
570				&minor_version, &device_handle);
571		if (r)
572			continue;
573
574		if (drmGetDevice2(drm_amdgpu[i],
575					DRM_DEVICE_GET_PCI_REVISION,
576					&device)) {
577			amdgpu_device_deinitialize(device_handle);
578			continue;
579		}
580
581		if (device->bustype != DRM_BUS_PCI) {
582			amdgpu_device_deinitialize(device_handle);
583			continue;
584		}
585
586		capability = amdgpu_ras_lookup_capability(device_handle);
587		if (capability == 0) {
588			amdgpu_device_deinitialize(device_handle);
589			continue;
590
591		}
592
593		id = amdgpu_ras_lookup_id(device);
594		if (id == -1) {
595			amdgpu_device_deinitialize(device_handle);
596			continue;
597		}
598
599		test_mask = amdgpu_ras_get_test_mask(device);
600
601		devices[devices_count++] = (struct amdgpu_ras_data) {
602			device_handle, id, capability, test_mask,
603		};
604	}
605
606	if (devices_count == 0)
607		return CUE_SINIT_FAILED;
608
609	return CUE_SUCCESS;
610}
611
612int suite_ras_tests_clean(void)
613{
614	int r;
615	int i;
616	int ret = CUE_SUCCESS;
617
618	for (i = 0; i < devices_count; i++) {
619		r = amdgpu_device_deinitialize(devices[i].device_handle);
620		if (r)
621			ret = CUE_SCLEAN_FAILED;
622	}
623	return ret;
624}
625
626static void amdgpu_ras_disable_test(void);
627static void amdgpu_ras_enable_test(void);
628static void amdgpu_ras_inject_test(void);
629static void amdgpu_ras_query_test(void);
630static void amdgpu_ras_basic_test(void);
631
632CU_TestInfo ras_tests[] = {
633	{ "ras basic test",	amdgpu_ras_basic_test },
634	{ "ras query test",	amdgpu_ras_query_test },
635	{ "ras inject test",	amdgpu_ras_inject_test },
636	{ "ras disable test",	amdgpu_ras_disable_test },
637#if 0
638	{ "ras enable test",	amdgpu_ras_enable_test },
639#endif
640	CU_TEST_INFO_NULL,
641};
642
643//helpers
644
645static int test_card;
646static char sysfs_path[1024];
647static char debugfs_path[1024];
648static uint32_t ras_mask;
649static amdgpu_device_handle device_handle;
650
651static int set_test_card(int card)
652{
653	int i;
654
655	test_card = card;
656	sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id);
657	sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id);
658	ras_mask = devices[card].capability;
659	device_handle = devices[card].device_handle;
660	ras_block_mask_inject = devices[card].test_mask.inject_mask;
661	ras_block_mask_query = devices[card].test_mask.query_mask;
662	ras_block_mask_basic = devices[card].test_mask.basic_mask;
663
664	return 0;
665}
666
667static const char *get_ras_sysfs_root(void)
668{
669	return sysfs_path;
670}
671
672static const char *get_ras_debugfs_root(void)
673{
674	return debugfs_path;
675}
676
/*
 * Write @size bytes from @buf to @file with a single write() call.
 *
 * Returns what write() returned (which may be less than @size), or -1 when
 * the file cannot be opened for writing.
 */
static int set_file_contents(char *file, char *buf, int size)
{
	int written = -1;
	int fd = open(file, O_WRONLY);

	if (fd != -1) {
		written = write(fd, buf, size);
		close(fd);
	}

	return written;
}
687
/*
 * Read up to @size bytes of @file into @buf with a single read() call.
 * No terminating NUL is appended.
 *
 * Returns what read() returned, or -1 when the file cannot be opened for
 * reading.
 */
static int get_file_contents(char *file, char *buf, int size)
{
	int got = -1;
	int fd = open(file, O_RDONLY);

	if (fd != -1) {
		got = read(fd, buf, size);
		close(fd);
	}

	return got;
}
698
/*
 * Check that @file can be opened with the given open(2) @flags.
 *
 * Returns 0 when the open succeeds (the descriptor is closed again at once)
 * and -1 when it fails.
 */
static int is_file_ok(char *file, int flags)
{
	int fd = open(file, flags);

	if (fd != -1) {
		close(fd);
		return 0;
	}

	return -1;
}
709
710static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)
711{
712	uint32_t feature_mask;
713	int ret;
714
715	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
716			sizeof(feature_mask), &feature_mask);
717	if (ret)
718		return -1;
719
720	return (1 << block) & feature_mask;
721}
722
723static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)
724{
725	return (1 << block) & ras_mask;
726}
727
728static int amdgpu_ras_invoke(struct ras_debug_if *data)
729{
730	char path[1024];
731	int ret;
732
733	sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
734
735	ret = set_file_contents(path, (char *)data, sizeof(*data))
736		- sizeof(*data);
737	return ret;
738}
739
740static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block,
741		unsigned long *ue, unsigned long *ce)
742{
743	char buf[64];
744	char name[1024];
745	int ret;
746
747	*ue = *ce = 0;
748
749	if (amdgpu_ras_is_feature_supported(block) <= 0)
750		return -1;
751
752	sprintf(name, "%s%s%s", get_ras_sysfs_root(), ras_block_str(block), "_err_count");
753
754	if (is_file_ok(name, O_RDONLY))
755		return 0;
756
757	if (get_file_contents(name, buf, sizeof(buf)) <= 0)
758		return -1;
759
760	if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2)
761		return -1;
762
763	return 0;
764}
765
766static int amdgpu_ras_inject(enum amdgpu_ras_block block,
767		uint32_t sub_block, enum amdgpu_ras_error_type type,
768		uint64_t address, uint64_t value)
769{
770	struct ras_debug_if data = { .op = 2, };
771	struct ras_inject_if *inject = &data.inject;
772	int ret;
773
774	if (amdgpu_ras_is_feature_enabled(block) <= 0) {
775		fprintf(stderr, "block id(%d) is not valid\n", block);
776		return -1;
777	}
778
779	inject->head.block = block;
780	inject->head.type = type;
781	inject->head.sub_block_index = sub_block;
782	strncpy(inject->head.name, ras_block_str(block), 32);
783	inject->address = address;
784	inject->value = value;
785
786	ret = amdgpu_ras_invoke(&data);
787	CU_ASSERT_EQUAL(ret, 0);
788	if (ret)
789		return -1;
790
791	return 0;
792}
793
794//tests
795static void amdgpu_ras_features_test(int enable)
796{
797	struct ras_debug_if data;
798	int ret;
799	int i;
800
801	data.op = enable;
802	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
803		struct ras_common_if head = {
804			.block = i,
805			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
806			.sub_block_index = 0,
807			.name = "",
808		};
809
810		if (amdgpu_ras_is_feature_supported(i) <= 0)
811			continue;
812
813		data.head = head;
814
815		ret = amdgpu_ras_invoke(&data);
816		CU_ASSERT_EQUAL(ret, 0);
817
818		if (ret)
819			continue;
820
821		ret = enable ^ amdgpu_ras_is_feature_enabled(i);
822		CU_ASSERT_EQUAL(ret, 0);
823	}
824}
825
826static void amdgpu_ras_disable_test(void)
827{
828	int i;
829	for (i = 0; i < devices_count; i++) {
830		set_test_card(i);
831		amdgpu_ras_features_test(0);
832	}
833}
834
835static void amdgpu_ras_enable_test(void)
836{
837	int i;
838	for (i = 0; i < devices_count; i++) {
839		set_test_card(i);
840		amdgpu_ras_features_test(1);
841	}
842}
843
844static void __amdgpu_ras_ip_inject_test(const struct ras_inject_test_config *ip_test,
845					uint32_t size)
846{
847	int i, ret;
848	unsigned long old_ue, old_ce;
849	unsigned long ue, ce;
850	uint32_t block;
851	int timeout;
852	bool pass;
853
854	for (i = 0; i < size; i++) {
855		timeout = 3;
856		pass = false;
857
858		block = amdgpu_ras_find_block_id_by_name(ip_test[i].block);
859
860		/* Ensure one valid ip block */
861		if (block == ARRAY_SIZE(ras_block_string))
862			break;
863
864		/* Ensure RAS feature for the IP block is enabled by kernel */
865		if (amdgpu_ras_is_feature_supported(block) <= 0)
866			break;
867
868		ret = amdgpu_ras_query_err_count(block, &old_ue, &old_ce);
869		CU_ASSERT_EQUAL(ret, 0);
870		if (ret)
871			break;
872
873		ret = amdgpu_ras_inject(block,
874					ip_test[i].sub_block,
875					ip_test[i].type,
876					ip_test[i].address,
877					ip_test[i].value);
878		CU_ASSERT_EQUAL(ret, 0);
879		if (ret)
880			break;
881
882		while (timeout > 0) {
883			sleep(5);
884
885			ret = amdgpu_ras_query_err_count(block, &ue, &ce);
886			CU_ASSERT_EQUAL(ret, 0);
887			if (ret)
888				break;
889
890			if (old_ue != ue || old_ce != ce) {
891				pass = true;
892				sleep(20);
893				break;
894			}
895			timeout -= 1;
896		}
897		printf("\t Test %s@block %s, subblock %d, error_type %s, address %ld, value %ld: %s\n",
898			ip_test[i].name,
899			ip_test[i].block,
900			ip_test[i].sub_block,
901			amdgpu_ras_get_error_type_id(ip_test[i].type),
902			ip_test[i].address,
903			ip_test[i].value,
904			pass ? "Pass" : "Fail");
905	}
906}
907
908static void __amdgpu_ras_inject_test(void)
909{
910	printf("...\n");
911
912	/* run UMC ras inject test */
913	__amdgpu_ras_ip_inject_test(umc_ras_inject_test,
914		ARRAY_SIZE(umc_ras_inject_test));
915
916	/* run GFX ras inject test */
917	__amdgpu_ras_ip_inject_test(gfx_ras_inject_test,
918		ARRAY_SIZE(gfx_ras_inject_test));
919}
920
921static void amdgpu_ras_inject_test(void)
922{
923	int i;
924	for (i = 0; i < devices_count; i++) {
925		set_test_card(i);
926		__amdgpu_ras_inject_test();
927	}
928}
929
930static void __amdgpu_ras_query_test(void)
931{
932	unsigned long ue, ce;
933	int ret;
934	int i;
935
936	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
937		if (amdgpu_ras_is_feature_supported(i) <= 0)
938			continue;
939
940		if (!((1 << i) & ras_block_mask_query))
941			continue;
942
943		ret = amdgpu_ras_query_err_count(i, &ue, &ce);
944		CU_ASSERT_EQUAL(ret, 0);
945	}
946}
947
948static void amdgpu_ras_query_test(void)
949{
950	int i;
951	for (i = 0; i < devices_count; i++) {
952		set_test_card(i);
953		__amdgpu_ras_query_test();
954	}
955}
956
957static void amdgpu_ras_basic_test(void)
958{
959	unsigned long ue, ce;
960	char name[1024];
961	int ret;
962	int i;
963	int j;
964	uint32_t features;
965	char path[1024];
966
967	ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY);
968	CU_ASSERT_EQUAL(ret, 0);
969
970	for (i = 0; i < devices_count; i++) {
971		set_test_card(i);
972
973		ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
974				sizeof(features), &features);
975		CU_ASSERT_EQUAL(ret, 0);
976
977		sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
978		ret = is_file_ok(path, O_WRONLY);
979		CU_ASSERT_EQUAL(ret, 0);
980
981		sprintf(path, "%s%s", get_ras_sysfs_root(), "features");
982		ret = is_file_ok(path, O_RDONLY);
983		CU_ASSERT_EQUAL(ret, 0);
984
985		for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) {
986			ret = amdgpu_ras_is_feature_supported(j);
987			if (ret <= 0)
988				continue;
989
990			if (!((1 << j) & ras_block_mask_basic))
991				continue;
992
993			sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count");
994			ret = is_file_ok(path, O_RDONLY);
995			CU_ASSERT_EQUAL(ret, 0);
996
997			sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject");
998			ret = is_file_ok(path, O_WRONLY);
999			CU_ASSERT_EQUAL(ret, 0);
1000		}
1001	}
1002}
1003