Home | History | Annotate | Line # | Download | only in amdgpu
      1 /*	$NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright 2019 Advanced Micro Devices, Inc.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
     20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     22  * OTHER DEALINGS IN THE SOFTWARE.
     23  *
     24  */
     25 
     26 #include <sys/cdefs.h>
     27 __KERNEL_RCSID(0, "$NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");
     28 
     29 #include "amdgpu_ras.h"
     30 
/*
 * Late RAS initialization for the UMC (Unified Memory Controller) block.
 *
 * Allocates and describes adev->umc.ras_if on first init, registers the
 * UMC RAS node (sysfs error count, debugfs error injection, interrupt
 * callback) with the RAS core, and — when UMC RAS is supported — enables
 * the ECC interrupt and runs the IP-version-specific error-counter init
 * hook.
 *
 * Returns 0 on success and also when UMC RAS is unsupported (the node is
 * freed and the feature stays disabled); returns a negative errno on
 * failure, in which case adev->umc.ras_if is freed and reset to NULL.
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "umc_err_count",
		.debugfs_name = "umc_err_inject",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_umc_process_ras_data_cb,
	};

	/* Allocate the common RAS descriptor on first init only. */
	if (!adev->umc.ras_if) {
		adev->umc.ras_if =
			kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->umc.ras_if)
			return -ENOMEM;
		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->umc.ras_if->sub_block_index = 0;
		strcpy(adev->umc.ras_if->name, "umc");
	}
	/* fs and interrupt interfaces both dispatch on the same RAS head. */
	ih_info.head = fs_info.head = *adev->umc.ras_if;

	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	} else {
		/* Not supported: drop the node and report success so the
		 * caller proceeds with the feature disabled.
		 */
		r = 0;
		goto free;
	}

	/* ras init of specific umc version */
	if (adev->umc.funcs && adev->umc.funcs->err_cnt_init)
		adev->umc.funcs->err_cnt_init(adev);

	return 0;

late_fini:
	/* Undo amdgpu_ras_late_init (fs entries / interrupt handler). */
	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
	kfree(adev->umc.ras_if);
	adev->umc.ras_if = NULL;
	return r;
}
     81 
     82 void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
     83 {
     84 	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
     85 			adev->umc.ras_if) {
     86 		struct ras_common_if *ras_if = adev->umc.ras_if;
     87 		struct ras_ih_if ih_info = {
     88 			.head = *ras_if,
     89 			.cb = amdgpu_umc_process_ras_data_cb,
     90 		};
     91 
     92 		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
     93 		kfree(ras_if);
     94 	}
     95 }
     96 
/*
 * RAS data callback for UMC errors, invoked from the RAS interrupt path
 * (and error queries) with a caller-provided ras_err_data accumulator.
 *
 * Queries the UMC error counters, collects the addresses of faulting
 * pages when the IP provides that hook, hands those pages to the RAS
 * bad-page tracker, and triggers a GPU reset if any uncorrectable error
 * was counted.  Always returns AMDGPU_RAS_SUCCESS.
 */
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	/* Flag the ECC event to the KFD (compute) side. */
	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_count)
	    adev->umc.funcs->query_ras_error_count(adev, ras_error_status);

	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_address &&
	    adev->umc.max_ras_err_cnt_per_query) {
		/* One record per potential error; ownership stays here and
		 * the buffer is freed at the bottom of this function.
		 */
		err_data->err_addr =
			kcalloc(adev->umc.max_ras_err_cnt_per_query,
				sizeof(struct eeprom_table_record), GFP_KERNEL);

		/* still call query_ras_error_address to clear error status
		 * even NOMEM error is encountered
		 */
		if(!err_data->err_addr)
			DRM_WARN("Failed to alloc memory for umc error address record!\n");

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		adev->umc.funcs->query_ras_error_address(adev, ras_error_status);
	}

	/* only uncorrectable error needs gpu reset */
	if (err_data->ue_count) {
		if (err_data->err_addr_cnt &&
		    amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						err_data->err_addr_cnt))
			DRM_WARN("Failed to add ras bad page!\n");

		amdgpu_ras_reset_gpu(adev);
	}

	/* kfree(NULL) is a no-op, so this is safe when no buffer was
	 * allocated above — assumes the caller zero-initialized *err_data;
	 * NOTE(review): confirm against the RAS core callers.
	 */
	kfree(err_data->err_addr);
	return AMDGPU_RAS_SUCCESS;
}
    140 
    141 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
    142 		struct amdgpu_irq_src *source,
    143 		struct amdgpu_iv_entry *entry)
    144 {
    145 	struct ras_common_if *ras_if = adev->umc.ras_if;
    146 	struct ras_dispatch_if ih_data = {
    147 		.entry = entry,
    148 	};
    149 
    150 	if (!ras_if)
    151 		return 0;
    152 
    153 	ih_data.head = *ras_if;
    154 
    155 	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
    156 	return 0;
    157 }
    158