/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "CUnit/Basic.h"

#include "amdgpu_test.h"
#include "amdgpu_drm.h"
#include "amdgpu_internal.h"
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <inttypes.h>
#include "xf86drm.h"
#include <limits.h>

#define PATH_SIZE PATH_MAX

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_block_str(i) (ras_block_string[i])

enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC = 0,
	AMDGPU_RAS_BLOCK__SDMA,
	AMDGPU_RAS_BLOCK__GFX,
	AMDGPU_RAS_BLOCK__MMHUB,
	AMDGPU_RAS_BLOCK__ATHUB,
	AMDGPU_RAS_BLOCK__PCIE_BIF,
	AMDGPU_RAS_BLOCK__HDP,
	AMDGPU_RAS_BLOCK__XGMI_WAFL,
	AMDGPU_RAS_BLOCK__DF,
	AMDGPU_RAS_BLOCK__SMN,
	AMDGPU_RAS_BLOCK__SEM,
	AMDGPU_RAS_BLOCK__MP0,
	AMDGPU_RAS_BLOCK__MP1,
	AMDGPU_RAS_BLOCK__FUSE,

	AMDGPU_RAS_BLOCK__LAST
};

#define AMDGPU_RAS_BLOCK_COUNT  AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK   ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)

enum amdgpu_ras_gfx_subblock {
	/* CPC */
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START = 0,
	AMDGPU_RAS_BLOCK__GFX_CPC_SCRATCH =
		AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	/* CPF */
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME2 =
		AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME1,
	AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	/* CPG */
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_ROQ =
		AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	/* GDS */
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_MEM = AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_INPUT_QUEUE,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_CMD_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_DATA_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	/* SPI */
	AMDGPU_RAS_BLOCK__GFX_SPI_SR_MEM,
	/* SQ */
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_SGPR = AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_I,
	AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_END = AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	/* SQC (3 ranges) */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	/* SQC range 0 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_UTCL1_LFIFO =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	/* SQC range 1 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	/* SQC range 2 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END,
	/* TA */
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO =
		AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_AFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FL_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FX_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	/* TCA */
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_HOLE_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	/* TCC (5 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	/* TCC range 0 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_HIGH_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	/* TCC range 1 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_DEC =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	/* TCC range 2 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_CONTROL,
	AMDGPU_RAS_BLOCK__GFX_TCC_UC_ATOMIC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_RETURN,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_CACHE_READ,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	/* TCC range 3 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	/* TCC range 4 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRRET_TAG_WRITE_RETURN =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END,
	/* TCI */
	AMDGPU_RAS_BLOCK__GFX_TCI_WRITE_RAM,
	/* TCP */
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM =
		AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_LFIFO_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_CMD_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_VM_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_DB_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO0,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	/* TD */
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO =
		AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_HI,
	AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	/* EA (3 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	/* EA range 0 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_RRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_WRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	/* EA range 1 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_PAGEMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IORD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	/* EA range 2 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D0MEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D1MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D2MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END,
	/* UTC VM L2 bank */
	AMDGPU_RAS_BLOCK__UTC_VML2_BANK_CACHE,
	/* UTC VM walker */
	AMDGPU_RAS_BLOCK__UTC_VML2_WALKER,
	/* UTC ATC L2 2MB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_2M_BANK,
	/* UTC ATC L2 4KB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_4K_BANK,
	AMDGPU_RAS_BLOCK__GFX_MAX
};

enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE					= 0,
	AMDGPU_RAS_ERROR__PARITY				= 1,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE			= 2,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE			= 4,
	AMDGPU_RAS_ERROR__POISON				= 8,
};

struct ras_inject_test_config {
	char name[64];
	char block[32];
	int sub_block;
	enum amdgpu_ras_error_type type;
	uint64_t address;
	uint64_t value;
};

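/*
 * The structs below are written verbatim to the ras_ctrl debugfs node and
 * are therefore assumed to match the kernel's ras_common_if/ras_inject_if/
 * ras_debug_if layout for this interface version.
 */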
struct ras_common_if {
	enum amdgpu_ras_block block;
	enum amdgpu_ras_error_type type;
	uint32_t sub_block_index;
	char name[32];
};

struct ras_inject_if {
	struct ras_common_if head;
	uint64_t address;
	uint64_t value;
};

struct ras_debug_if {
	union {
		struct ras_common_if head;
		struct ras_inject_if inject;
	};
	int op;
};
/* For now, only the umc, gfx and sdma blocks are implemented. */
#define DEFAULT_RAS_BLOCK_MASK_INJECT ((1 << AMDGPU_RAS_BLOCK__UMC) |\
		(1 << AMDGPU_RAS_BLOCK__GFX))
#define DEFAULT_RAS_BLOCK_MASK_QUERY ((1 << AMDGPU_RAS_BLOCK__UMC) |\
		(1 << AMDGPU_RAS_BLOCK__GFX))
#define DEFAULT_RAS_BLOCK_MASK_BASIC ((1 << AMDGPU_RAS_BLOCK__UMC) |\
		(1 << AMDGPU_RAS_BLOCK__SDMA) |\
		(1 << AMDGPU_RAS_BLOCK__GFX))

static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT;
static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_QUERY;
static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC;

struct ras_test_mask {
	uint32_t inject_mask;
	uint32_t query_mask;
	uint32_t basic_mask;
};

struct amdgpu_ras_data {
	amdgpu_device_handle device_handle;
	uint32_t  id;
	uint32_t  capability;
	struct ras_test_mask test_mask;
};

/* All devices that have RAS support. */
static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED];
static int devices_count;

struct ras_DID_test_mask {
	uint16_t device_id;
	uint16_t revision_id;
	struct ras_test_mask test_mask;
};

/* white list for inject test. */
#define RAS_BLOCK_MASK_ALL {\
	DEFAULT_RAS_BLOCK_MASK_INJECT,\
	DEFAULT_RAS_BLOCK_MASK_QUERY,\
	DEFAULT_RAS_BLOCK_MASK_BASIC\
}

#define RAS_BLOCK_MASK_QUERY_BASIC {\
	0,\
	DEFAULT_RAS_BLOCK_MASK_QUERY,\
	DEFAULT_RAS_BLOCK_MASK_BASIC\
}

static const struct ras_inject_test_config umc_ras_inject_test[] = {
	{"ras_umc.1.0", "umc", 0, AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
};

static const struct ras_inject_test_config gfx_ras_inject_test[] = {
	{"ras_gfx.2.0", "gfx", AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.1", "gfx", AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.2", "gfx", AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.3", "gfx", AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.4", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.5", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.6", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.7", "gfx", AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.8", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.9", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.10", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.11", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.12", "gfx", AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.13", "gfx", AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
	{"ras_gfx.2.14", "gfx", AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM,
		AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
};

static const struct ras_DID_test_mask ras_DID_array[] = {
	{0x66a1, 0x00, RAS_BLOCK_MASK_ALL},
	{0x66a1, 0x01, RAS_BLOCK_MASK_ALL},
	{0x66a1, 0x04, RAS_BLOCK_MASK_ALL},
};

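/*
 * Map a RAS block name to its index in ras_block_string[]; returns
 * ARRAY_SIZE(ras_block_string) when the name is unknown.
 */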
static uint32_t amdgpu_ras_find_block_id_by_name(const char *name)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		if (strcmp(name, ras_block_string[i]) == 0)
			return i;
	}

	return ARRAY_SIZE(ras_block_string);
}

static char *amdgpu_ras_get_error_type_id(enum amdgpu_ras_error_type type)
{
	switch (type) {
	case AMDGPU_RAS_ERROR__PARITY:
		return "parity";
	case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
		return "single_correctable";
	case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
		return "multi_uncorrectable";
	case AMDGPU_RAS_ERROR__POISON:
		return "poison";
	case AMDGPU_RAS_ERROR__NONE:
	default:
		return NULL;
	}
}

static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device)
{
	int i;
	static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC;

	for (i = 0; i < ARRAY_SIZE(ras_DID_array); i++) {
		if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id &&
				ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id)
			return ras_DID_array[i].test_mask;
	}
	return default_test_mask;
}

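/*
 * AMDGPU_INFO_RAS_ENABLED_FEATURES returns a 64-bit feature word; the union
 * below assumes the low 32 bits report the currently enabled blocks and the
 * high 32 bits the supported ones.
 */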
static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle)
{
	union {
		uint64_t feature_mask;
		struct {
			uint32_t enabled_features;
			uint32_t supported_features;
		};
	} features = { 0 };
	int ret;

	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
			sizeof(features), &features);
	if (ret)
		return 0;

	return features.supported_features;
}

static int get_file_contents(char *file, char *buf, int size);

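/*
 * Find the DRI debugfs/card index of a device by matching its PCI bus info
 * against /sys/kernel/debug/dri/<i>/name; returns -1 when no card matches.
 */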
static int amdgpu_ras_lookup_id(drmDevicePtr device)
{
	char path[PATH_SIZE];
	char str[128];
	drmPciBusInfo info;
	int i;
	int ret;

	for (i = 0; i < MAX_CARDS_SUPPORTED; i++) {
		memset(str, 0, sizeof(str));
		memset(&info, 0, sizeof(info));
		snprintf(path, PATH_SIZE, "/sys/kernel/debug/dri/%d/name", i);
		if (get_file_contents(path, str, sizeof(str)) <= 0)
			continue;

		ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx",
				&info.domain, &info.bus, &info.dev, &info.func);
		if (ret != 4)
			continue;

		if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0)
			return i;
	}
	return -1;
}

/* Helpers */

static int test_card;
static char sysfs_path[PATH_SIZE];
static char debugfs_path[PATH_SIZE];
static uint32_t ras_mask;
static amdgpu_device_handle device_handle;

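/*
 * Select the card under test: cache its RAS sysfs/debugfs roots, capability
 * mask and the per-test block masks used by the individual tests.
 */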
static void set_test_card(int card)
{
	test_card = card;
	snprintf(sysfs_path, PATH_SIZE, "/sys/class/drm/card%d/device/ras/", devices[card].id);
	snprintf(debugfs_path, PATH_SIZE, "/sys/kernel/debug/dri/%d/ras/", devices[card].id);
	ras_mask = devices[card].capability;
	device_handle = devices[card].device_handle;
	ras_block_mask_inject = devices[card].test_mask.inject_mask;
	ras_block_mask_query = devices[card].test_mask.query_mask;
	ras_block_mask_basic = devices[card].test_mask.basic_mask;
}

static const char *get_ras_sysfs_root(void)
{
	return sysfs_path;
}

static const char *get_ras_debugfs_root(void)
{
	return debugfs_path;
}

static int set_file_contents(char *file, char *buf, int size)
{
	int n, fd;
	fd = open(file, O_WRONLY);
	if (fd == -1)
		return -1;
	n = write(fd, buf, size);
	close(fd);
	return n;
}

static int get_file_contents(char *file, char *buf, int size)
{
	int n, fd;
	fd = open(file, O_RDONLY);
	if (fd == -1)
		return -1;
	n = read(fd, buf, size);
	close(fd);
	return n;
}

static int is_file_ok(char *file, int flags)
{
	int fd;

	fd = open(file, flags);
	if (fd == -1)
		return -1;
	close(fd);
	return 0;
}

static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)
{
	uint32_t feature_mask;
	int ret;

	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
			sizeof(feature_mask), &feature_mask);
	if (ret)
		return -1;

	return (1 << block) & feature_mask;
}

static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)
{
	return (1 << block) & ras_mask;
}

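/*
 * Write a ras_debug_if request to the ras_ctrl debugfs node; returns 0 only
 * when the whole struct was accepted by the kernel.
 */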
static int amdgpu_ras_invoke(struct ras_debug_if *data)
{
	char path[PATH_SIZE];
	int ret;

	snprintf(path, sizeof(path), "%s", get_ras_debugfs_root());
	strncat(path, "ras_ctrl", sizeof(path) - strlen(path) - 1);

	ret = set_file_contents(path, (char *)data, sizeof(*data))
		- sizeof(*data);
	return ret;
}

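/*
 * Read the <block>_err_count sysfs node and parse the "ue:"/"ce:" counters.
 * A missing node is treated as zero errors rather than as a failure.
 */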
static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block,
		unsigned long *ue, unsigned long *ce)
{
	char buf[64];
	char name[PATH_SIZE];

	*ue = *ce = 0;

	if (amdgpu_ras_is_feature_supported(block) <= 0)
		return -1;

	snprintf(name, sizeof(name), "%s", get_ras_sysfs_root());
	strncat(name, ras_block_str(block), sizeof(name) - strlen(name) - 1);
	strncat(name, "_err_count", sizeof(name) - strlen(name) - 1);

	if (is_file_ok(name, O_RDONLY))
		return 0;

	if (get_file_contents(name, buf, sizeof(buf)) <= 0)
		return -1;

	if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2)
		return -1;

	return 0;
}

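/*
 * Inject an error (ras_ctrl op 2) into the given block/sub-block with the
 * requested error type, address and value.
 */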
static int amdgpu_ras_inject(enum amdgpu_ras_block block,
		uint32_t sub_block, enum amdgpu_ras_error_type type,
		uint64_t address, uint64_t value)
{
	struct ras_debug_if data = { .op = 2, };
	struct ras_inject_if *inject = &data.inject;
	int ret;

	if (amdgpu_ras_is_feature_enabled(block) <= 0) {
		fprintf(stderr, "RAS feature for block id(%d) is not enabled\n", block);
		return -1;
	}

	inject->head.block = block;
	inject->head.type = type;
	inject->head.sub_block_index = sub_block;
	strncpy(inject->head.name, ras_block_str(block), sizeof(inject->head.name) - 1);
	inject->address = address;
	inject->value = value;

	ret = amdgpu_ras_invoke(&data);
	CU_ASSERT_EQUAL(ret, 0);
	if (ret)
		return -1;

	return 0;
}

/* Tests */
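/*
 * Enable or disable RAS for every supported block: 'enable' doubles as the
 * ras_ctrl op (0 = disable, 1 = enable) and the result is verified against
 * the enabled-feature mask reported by the kernel.
 */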
static void amdgpu_ras_features_test(int enable)
{
	struct ras_debug_if data;
	int ret;
	int i;

	data.op = enable;
	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
			.sub_block_index = 0,
			.name = "",
		};

		if (amdgpu_ras_is_feature_supported(i) <= 0)
			continue;

		data.head = head;

		ret = amdgpu_ras_invoke(&data);
		CU_ASSERT_EQUAL(ret, 0);

		if (ret)
			continue;

		/* The feature bit for block i must now match 'enable'. */
		ret = enable ^ (amdgpu_ras_is_feature_enabled(i) > 0);
		CU_ASSERT_EQUAL(ret, 0);
	}
}

static void amdgpu_ras_disable_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		amdgpu_ras_features_test(0);
	}
}

static void amdgpu_ras_enable_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		amdgpu_ras_features_test(1);
	}
}

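/*
 * Run one table of injection cases: inject the error, then poll the block's
 * error counters (up to 3 times, 5 s apart) and count any change as a pass.
 */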
static void __amdgpu_ras_ip_inject_test(const struct ras_inject_test_config *ip_test,
					uint32_t size)
{
	int i, ret;
	unsigned long old_ue, old_ce;
	unsigned long ue, ce;
	uint32_t block;
	int timeout;
	bool pass;

	for (i = 0; i < size; i++) {
		timeout = 3;
		pass = false;

		block = amdgpu_ras_find_block_id_by_name(ip_test[i].block);

		/* Ensure the IP block name is valid */
		if (block == ARRAY_SIZE(ras_block_string))
			break;

		/* Ensure the RAS feature for the IP block is supported by the kernel */
		if (amdgpu_ras_is_feature_supported(block) <= 0)
			break;

		ret = amdgpu_ras_query_err_count(block, &old_ue, &old_ce);
		CU_ASSERT_EQUAL(ret, 0);
		if (ret)
			break;

		ret = amdgpu_ras_inject(block,
					ip_test[i].sub_block,
					ip_test[i].type,
					ip_test[i].address,
					ip_test[i].value);
		CU_ASSERT_EQUAL(ret, 0);
		if (ret)
			break;

		while (timeout > 0) {
			sleep(5);

			ret = amdgpu_ras_query_err_count(block, &ue, &ce);
			CU_ASSERT_EQUAL(ret, 0);
			if (ret)
				break;

			if (old_ue != ue || old_ce != ce) {
				pass = true;
				sleep(20);
				break;
			}
			timeout -= 1;
		}
		printf("\t Test %s@block %s, subblock %d, error_type %s, address %" PRIu64 ", value %" PRIu64 ": %s\n",
			ip_test[i].name,
			ip_test[i].block,
			ip_test[i].sub_block,
			amdgpu_ras_get_error_type_id(ip_test[i].type),
			ip_test[i].address,
			ip_test[i].value,
			pass ? "Pass" : "Fail");
	}
}

static void __amdgpu_ras_inject_test(void)
{
	printf("...\n");

	/* run UMC ras inject test */
	__amdgpu_ras_ip_inject_test(umc_ras_inject_test,
		ARRAY_SIZE(umc_ras_inject_test));

	/* run GFX ras inject test */
	__amdgpu_ras_ip_inject_test(gfx_ras_inject_test,
		ARRAY_SIZE(gfx_ras_inject_test));
}

static void amdgpu_ras_inject_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		__amdgpu_ras_inject_test();
	}
}

static void __amdgpu_ras_query_test(void)
{
	unsigned long ue, ce;
	int ret;
	int i;

	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
		if (amdgpu_ras_is_feature_supported(i) <= 0)
			continue;

		if (!((1 << i) & ras_block_mask_query))
			continue;

		ret = amdgpu_ras_query_err_count(i, &ue, &ce);
		CU_ASSERT_EQUAL(ret, 0);
	}
}

static void amdgpu_ras_query_test(void)
{
	int i;
	for (i = 0; i < devices_count; i++) {
		set_test_card(i);
		__amdgpu_ras_query_test();
	}
}

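/*
 * Sanity-check that the expected RAS sysfs and debugfs nodes exist for every
 * supported block on every RAS-capable card.
 */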
static void amdgpu_ras_basic_test(void)
{
	int ret;
	int i;
	int j;
	uint32_t features;
	char path[PATH_SIZE];

	ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY);
	CU_ASSERT_EQUAL(ret, 0);

	for (i = 0; i < devices_count; i++) {
		set_test_card(i);

		ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
				sizeof(features), &features);
		CU_ASSERT_EQUAL(ret, 0);

		snprintf(path, sizeof(path), "%s", get_ras_debugfs_root());
		strncat(path, "ras_ctrl", sizeof(path) - strlen(path) - 1);

		ret = is_file_ok(path, O_WRONLY);
		CU_ASSERT_EQUAL(ret, 0);

		snprintf(path, sizeof(path), "%s", get_ras_sysfs_root());
		strncat(path, "features", sizeof(path) - strlen(path) - 1);

		ret = is_file_ok(path, O_RDONLY);
		CU_ASSERT_EQUAL(ret, 0);

		for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) {
			ret = amdgpu_ras_is_feature_supported(j);
			if (ret <= 0)
				continue;

			if (!((1 << j) & ras_block_mask_basic))
				continue;

			snprintf(path, sizeof(path), "%s", get_ras_sysfs_root());
			strncat(path, ras_block_str(j), sizeof(path) - strlen(path) - 1);
			strncat(path, "_err_count", sizeof(path) - strlen(path) - 1);

			ret = is_file_ok(path, O_RDONLY);
			CU_ASSERT_EQUAL(ret, 0);

			snprintf(path, sizeof(path), "%s", get_ras_debugfs_root());
			strncat(path, ras_block_str(j), sizeof(path) - strlen(path) - 1);
			strncat(path, "_err_inject", sizeof(path) - strlen(path) - 1);

			ret = is_file_ok(path, O_WRONLY);
			CU_ASSERT_EQUAL(ret, 0);
		}
	}
}

CU_TestInfo ras_tests[] = {
	{ "ras basic test",	amdgpu_ras_basic_test },
	{ "ras query test",	amdgpu_ras_query_test },
	{ "ras inject test",	amdgpu_ras_inject_test },
	{ "ras disable test",	amdgpu_ras_disable_test },
	{ "ras enable test",	amdgpu_ras_enable_test },
	CU_TEST_INFO_NULL,
};

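/*
 * Enable the suite only when at least one PCI amdgpu device reports RAS
 * capability.
 */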
CU_BOOL suite_ras_tests_enable(void)
{
	amdgpu_device_handle device_handle;
	uint32_t  major_version;
	uint32_t  minor_version;
	int i;
	drmDevicePtr device;

	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
		if (amdgpu_device_initialize(drm_amdgpu[i], &major_version,
					&minor_version, &device_handle))
			continue;

		if (drmGetDevice2(drm_amdgpu[i],
					DRM_DEVICE_GET_PCI_REVISION,
					&device)) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		if (device->bustype == DRM_BUS_PCI &&
				amdgpu_ras_lookup_capability(device_handle)) {
			amdgpu_device_deinitialize(device_handle);
			return CU_TRUE;
		}

		if (amdgpu_device_deinitialize(device_handle))
			continue;
	}

	return CU_FALSE;
}

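/*
 * Collect every RAS-capable PCI amdgpu device into the devices[] table used
 * by the tests above.
 */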
int suite_ras_tests_init(void)
{
	drmDevicePtr device;
	amdgpu_device_handle device_handle;
	uint32_t  major_version;
	uint32_t  minor_version;
	uint32_t  capability;
	struct ras_test_mask test_mask;
	int id;
	int i;
	int r;

	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
		r = amdgpu_device_initialize(drm_amdgpu[i], &major_version,
				&minor_version, &device_handle);
		if (r)
			continue;

		if (drmGetDevice2(drm_amdgpu[i],
					DRM_DEVICE_GET_PCI_REVISION,
					&device)) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		if (device->bustype != DRM_BUS_PCI) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		capability = amdgpu_ras_lookup_capability(device_handle);
		if (capability == 0) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		id = amdgpu_ras_lookup_id(device);
		if (id == -1) {
			amdgpu_device_deinitialize(device_handle);
			continue;
		}

		test_mask = amdgpu_ras_get_test_mask(device);

		devices[devices_count++] = (struct amdgpu_ras_data) {
			device_handle, id, capability, test_mask,
		};
	}

	if (devices_count == 0)
		return CUE_SINIT_FAILED;

	return CUE_SUCCESS;
}

int suite_ras_tests_clean(void)
{
	int r;
	int i;
	int ret = CUE_SUCCESS;

	for (i = 0; i < devices_count; i++) {
		r = amdgpu_device_deinitialize(devices[i].device_handle);
		if (r)
			ret = CUE_SCLEAN_FAILED;
	}
	return ret;
}