/*	$NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $	*/

/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $");

#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_smu.h"
#include "amdgpu_ras.h"
#include "df/df_3_6_offset.h"

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_HIVE			8
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
static unsigned hive_count = 0;

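/*
 * Note: despite its name, this helper takes no lock.  It simply hands back
 * a pointer to the hive's device list head so callers can use it as an
 * opaque handle to the hive.
 */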
void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
{
	return &hive->device_list;
}

/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously concatenating the
 * power-of-two padded VRAM space of each node.
 *
 */
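
/*
 * Illustrative example (hypothetical numbers, not taken from any particular
 * board): in a two-node hive where each node exposes 12 GB of VRAM, each
 * node's range is padded up to the next power of two, 16 GB.  Node 0 then
 * occupies offsets [0, 16 GB) of the shared XGMI space and node 1 occupies
 * [16 GB, 32 GB), so peer VRAM can be reached with a simple base offset.
 */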


static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_hive_info *hive =
			container_of(attr, struct amdgpu_hive_info, dev_attr);

	return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
}

static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret = 0;

	if (WARN_ON(hive->kobj))
		return -EINVAL;

	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
	if (!hive->kobj) {
		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
		return -EINVAL;
	}

	hive->dev_attr = (struct device_attribute) {
		.attr = {
			.name = "xgmi_hive_id",
			.mode = S_IRUGO,

		},
		.show = amdgpu_xgmi_show_hive_id,
	};

	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
		kobject_del(hive->kobj);
		kobject_put(hive->kobj);
		hive->kobj = NULL;
	}

	return ret;
#else
	/* Without sysfs there is nothing to create; report success. */
	return 0;
#endif
}

static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
	kobject_del(hive->kobj);
	kobject_put(hive->kobj);
	hive->kobj = NULL;
#endif
}

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);

}

#define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

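	/*
	 * Decode of the status word as implemented below (the exact register
	 * semantics are not documented here): when the low 16 bits read back
	 * as 2, the error count is derived from the two uppermost bits of the
	 * 64-bit value, after which the status register is cleared.
	 */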
	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
}


static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);

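/*
 * Populate this device's XGMI sysfs state: create the xgmi_device_id and
 * xgmi_error files, link every device except the first one in the hive back
 * to the hive info directory, and register this device in the hive directory
 * as node%d.
 */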
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					 struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");


	/*
	 * On every device except the first one in the hive, create a sysfs
	 * link to the hive info folder (which lives on that first device).
	 */
	if (adev != hive->adev) {
		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	sprintf(node, "node%d", hive->number_devices);
	/* Create sysfs link from the hive folder to yourself */
	ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;


remove_link:
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
	return ret;
#else
	/* Without sysfs there is nothing to add; report success. */
	return 0;
#endif
}

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
	sysfs_remove_link(hive->kobj, adev->ddev->unique);
#endif
}


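/*
 * Look up the hive matching adev's hive ID in the global xgmi_hives table,
 * creating and initializing a new entry if none exists yet.  When @lock is
 * non-zero the hive lock is taken before returning.  Returns NULL if the
 * device has no hive ID, the hive table is full, or sysfs setup fails.
 */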
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{
	int i;
	struct amdgpu_hive_info *tmp;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	mutex_lock(&xgmi_mutex);

	for (i = 0 ; i < hive_count; ++i) {
		tmp = &xgmi_hives[i];
		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
			if (lock)
				mutex_lock(&tmp->hive_lock);
			mutex_unlock(&xgmi_mutex);
			return tmp;
		}
	}
	if (i >= AMDGPU_MAX_XGMI_HIVE) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	/* initialize a new hive if one does not exist yet */
	tmp = &xgmi_hives[hive_count++];

	if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	tmp->adev = adev;
	tmp->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&tmp->device_list);
	mutex_init(&tmp->hive_lock);
	mutex_init(&tmp->reset_lock);
	task_barrier_init(&tmp->tb);

	if (lock)
		mutex_lock(&tmp->hive_lock);
	tmp->pstate = -1;
	mutex_unlock(&xgmi_mutex);

	return tmp;
}

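/*
 * Request an XGMI pstate change through DPM and update the per-device
 * pstate.  The hive-wide pstate is only updated once every device in the
 * hive reports the same pstate (or, on VEGA20, when a high pstate is
 * requested).
 */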
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
	struct amdgpu_device *tmp_adev;
	bool update_hive_pstate = true;
	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;

	if (!hive)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (hive->pstate == pstate) {
		adev->pstate = is_high_pstate ? pstate : adev->pstate;
		goto out;
	}

	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
	if (ret) {
		dev_err(adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	/* Update device pstate */
	adev->pstate = pstate;

	/*
	 * Update the hive pstate only if all devices of the hive
	 * are in the same pstate
	 */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		if (tmp_adev->pstate != adev->pstate) {
			update_hive_pstate = false;
			break;
		}
	}
	if (update_hive_pstate || is_high_pstate)
		hive->pstate = pstate;

out:
	mutex_unlock(&hive->hive_lock);

	return ret;
}

int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret = -EINVAL;

	/* Each PSP needs to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 hive->number_devices,
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}


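/*
 * Return the number of XGMI hops between adev and peer_adev according to
 * the cached topology info, or -EINVAL if the peer is not in the topology.
 */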
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
		struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	for (i = 0 ; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops;
	return -EINVAL;
}

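/*
 * Register adev with its XGMI hive: obtain the hive and node IDs from the
 * PSP (or use fixed defaults when no PSP block is present), join the hive's
 * device list, push the updated topology to every member, and create the
 * sysfs entries for this device.
 */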
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi	*entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}

	/* Set default device pstate */
	adev->pstate = -1;

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	hive->number_devices = count;

	task_barrier_add_task(&hive->tb);

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update node list for other devices in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		/* get latest topology info for each device from psp */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				/* To do : continue with some node failed or disable the whole hive */
				goto exit_unlock;
			}
		}
	}

	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

	/* Drop the hive lock on every path that acquired it above. */
exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret)
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	else
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);

	return ret;
}

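/*
 * Drop adev from its hive: when no devices remain, tear down the hive's
 * sysfs state, task barrier, and locks; otherwise remove just this device's
 * barrier task and sysfs info and release the hive lock.
 */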
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;

	if (!adev->gmc.xgmi.supported)
		return;

	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive)
		return;

	if (!(hive->number_devices--)) {
		amdgpu_xgmi_sysfs_destroy(adev, hive);
		task_barrier_destroy(&hive->tb);
		mutex_destroy(&hive->hive_lock);
		mutex_destroy(&hive->reset_lock);
	} else {
		task_barrier_rem_task(&hive->tb);
		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
		mutex_unlock(&hive->hive_lock);
	}
}

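/*
 * Late RAS init for the XGMI/WAFL block: allocate the ras_if descriptor on
 * first use and register it with the RAS framework; if registration fails
 * or RAS is not supported for this block, free the descriptor again.
 */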
int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_ih_if ih_info = {
		.cb = NULL,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "xgmi_wafl_err_count",
		.debugfs_name = "xgmi_wafl_err_inject",
	};

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	if (!adev->gmc.xgmi.ras_if) {
		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gmc.xgmi.ras_if)
			return -ENOMEM;
		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gmc.xgmi.ras_if->sub_block_index = 0;
		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
	}
	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
				 &fs_info, &ih_info);
	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
		kfree(adev->gmc.xgmi.ras_if);
		adev->gmc.xgmi.ras_if = NULL;
	}

	return r;
}

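/*
 * Undo amdgpu_xgmi_ras_late_init(): unregister the XGMI/WAFL RAS interface
 * and free its descriptor if it was set up.
 */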
void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
			adev->gmc.xgmi.ras_if) {
		struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
		struct ras_ih_if ih_info = {
			.cb = NULL,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}