Merge branch 'amdkfd-v6' of git://people.freedesktop.org/~gabbayo/linux into drm-next

Merge AMDKFD it seems clean enough. * 'amdkfd-v6' of git://people.freedesktop.org/~gabbayo/linux: (29 commits) amdkfd: Implement the Get Version IOCTL amdkfd: Implement the Get Process Aperture IOCTL amdkfd: Implement the Get Clock Counters IOCTL amdkfd: Implement the Set Memory Policy IOCTL amdkfd: Implement the create/destroy/update queue IOCTLs amdkfd: Add interrupt handling module amdkfd: Add device queue manager module amdkfd: Add process queue manager module amdkfd: Add packet manager module amdkfd: Add module parameter of scheduling policy amdkfd: Add kernel queue module amdkfd: Add mqd_manager module amdkfd: Add queue module amdkfd: Add binding/unbinding calls to amd_iommu driver amdkfd: Add basic modules to amdkfd amdkfd: Add topology module to amdkfd amdkfd: Add amdkfd skeleton driver amdkfd: Add IOCTL set definitions of amdkfd Update MAINTAINERS and CREDITS files with amdkfd info drm/radeon: Add radeon <--> amdkfd interface ...
2024-11-30 05:06:44 +07:00 · 2014-11-20 14:32:32 +10:00 · 2014-11-20 14:32:32 +10:00 · cc5ac1ca79
commit cc5ac1ca79
parent 8975626ea3 ecd5c9821c
53 changed files with 9855 additions and 172 deletions
--- a/7
+++ b/7
@ -1197,6 +1197,13 @@ S: R. Tocantins, 89 - Cristo Rei
 S: 80050-430 - Curitiba - Paraná
 S: Brazil
 N: Oded Gabbay
 E: oded.gabbay@gmail.com
 D: AMD KFD maintainer
 S: 12 Shraga Raphaeli
 S: Petah-Tikva, 4906418
 S: Israel
 N: Kumar Gala
 E: galak@kernel.crashing.org
 D: Embedded PowerPC 6xx/7xx/74xx/82xx/83xx/85xx support
--- a/10
+++ b/10
@ -618,6 +618,16 @@ S:	Maintained
 F:	drivers/iommu/amd_iommu*.[ch]
 F:	include/linux/amd-iommu.h
 AMD KFD
 M:      Oded Gabbay <oded.gabbay@amd.com>
 L:      dri-devel@lists.freedesktop.org
 T:      git git://people.freedesktop.org/~gabbayo/linux.git
 S:      Supported
 F:      drivers/gpu/drm/amd/amdkfd/
 F:      drivers/gpu/drm/radeon/radeon_kfd.c
 F:      drivers/gpu/drm/radeon/radeon_kfd.h
 F:      include/uapi/linux/kfd_ioctl.h
 AMD MICROCODE UPDATE SUPPORT
 M:	Andreas Herrmann <herrmann.der.user@googlemail.com>
 L:	amd64-microcode@amd64.org
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@ -200,3 +200,5 @@ source "drivers/gpu/drm/tegra/Kconfig"
 source "drivers/gpu/drm/panel/Kconfig"
 source "drivers/gpu/drm/sti/Kconfig"
 source "drivers/gpu/drm/amd/amdkfd/Kconfig"
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@ -65,3 +65,4 @@ obj-$(CONFIG_DRM_STI) += sti/
 obj-y			+= i2c/
 obj-y			+= panel/
 obj-y			+= bridge/
 obj-$(CONFIG_HSA_AMD) += amd/amdkfd/
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@ -0,0 +1,9 @@
 #
 # Heterogeneous system architecture configuration
 #
 config HSA_AMD
 	tristate "HSA kernel driver for AMD GPU devices"
 	depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64
 	help
 	  Enable this if you want to use HSA features on AMD GPU devices.
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@ -0,0 +1,14 @@
 #
 # Makefile for Heterogeneous System Architecture support for AMD GPU devices
 #
 ccflags-y := -Iinclude/drm -Idrivers/gpu/drm/amd/include/
 amdkfd-y	:= kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
 		kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
 		kfd_process.o kfd_queue.o kfd_mqd_manager.o \
 		kfd_kernel_queue.o kfd_packet_manager.o \
 		kfd_process_queue_manager.o kfd_device_queue_manager.o \
 		kfd_interrupt.o
 obj-$(CONFIG_HSA_AMD)	+= amdkfd.o
--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
@ -0,0 +1,221 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef CIK_REGS_H
 #define CIK_REGS_H
 #define IH_VMID_0_LUT					0x3D40u
 #define BIF_DOORBELL_CNTL				0x530Cu
 #define	SRBM_GFX_CNTL					0xE44
 #define	PIPEID(x)					((x) << 0)
 #define	MEID(x)						((x) << 2)
 #define	VMID(x)						((x) << 4)
 #define	QUEUEID(x)					((x) << 8)
 #define	SQ_CONFIG					0x8C00
 #define	SH_MEM_BASES					0x8C28
 /* if PTR32, these are the bases for scratch and lds */
 #define	PRIVATE_BASE(x)					((x) << 0) /* scratch */
 #define	SHARED_BASE(x)					((x) << 16) /* LDS */
 #define	SH_MEM_APE1_BASE				0x8C2C
 /* if PTR32, this is the base location of GPUVM */
 #define	SH_MEM_APE1_LIMIT				0x8C30
 /* if PTR32, this is the upper limit of GPUVM */
 #define	SH_MEM_CONFIG					0x8C34
 #define	PTR32						(1 << 0)
 #define PRIVATE_ATC					(1 << 1)
 #define	ALIGNMENT_MODE(x)				((x) << 2)
 #define	SH_MEM_ALIGNMENT_MODE_DWORD			0
 #define	SH_MEM_ALIGNMENT_MODE_DWORD_STRICT		1
 #define	SH_MEM_ALIGNMENT_MODE_STRICT			2
 #define	SH_MEM_ALIGNMENT_MODE_UNALIGNED			3
 #define	DEFAULT_MTYPE(x)				((x) << 4)
 #define	APE1_MTYPE(x)					((x) << 7)
 /* valid for both DEFAULT_MTYPE and APE1_MTYPE */
 #define	MTYPE_CACHED					0
 #define	MTYPE_NONCACHED					3
 #define SH_STATIC_MEM_CONFIG				0x9604u
 #define	TC_CFG_L1_LOAD_POLICY0				0xAC68
 #define	TC_CFG_L1_LOAD_POLICY1				0xAC6C
 #define	TC_CFG_L1_STORE_POLICY				0xAC70
 #define	TC_CFG_L2_LOAD_POLICY0				0xAC74
 #define	TC_CFG_L2_LOAD_POLICY1				0xAC78
 #define	TC_CFG_L2_STORE_POLICY0				0xAC7C
 #define	TC_CFG_L2_STORE_POLICY1				0xAC80
 #define	TC_CFG_L2_ATOMIC_POLICY				0xAC84
 #define	TC_CFG_L1_VOLATILE				0xAC88
 #define	TC_CFG_L2_VOLATILE				0xAC8C
 /* CP_PQ_WPTR_POLL_CNTL register offset, followed by its bit-31 flag. */
 #define CP_PQ_WPTR_POLL_CNTL				0xC20C
 /*
  * Unsigned literal on purpose: shifting a signed 1 into bit 31 overflows int,
  * yielding a negative value that sign-extends when widened to 64 bits.
  */
 #define	WPTR_POLL_EN					(1U << 31)
 #define CPC_INT_CNTL					0xC2D0
 #define CP_ME1_PIPE0_INT_CNTL				0xC214
 #define CP_ME1_PIPE1_INT_CNTL				0xC218
 #define CP_ME1_PIPE2_INT_CNTL				0xC21C
 #define CP_ME1_PIPE3_INT_CNTL				0xC220
 #define CP_ME2_PIPE0_INT_CNTL				0xC224
 #define CP_ME2_PIPE1_INT_CNTL				0xC228
 #define CP_ME2_PIPE2_INT_CNTL				0xC22C
 #define CP_ME2_PIPE3_INT_CNTL				0xC230
 /* Single-bit *_INT_ENABLE flags (bit positions 13, 17, 23, 26, 29, 30, 31). */
 #define DEQUEUE_REQUEST_INT_ENABLE			(1 << 13)
 #define WRM_POLL_TIMEOUT_INT_ENABLE			(1 << 17)
 #define PRIV_REG_INT_ENABLE				(1 << 23)
 #define TIME_STAMP_INT_ENABLE				(1 << 26)
 #define GENERIC2_INT_ENABLE				(1 << 29)
 #define GENERIC1_INT_ENABLE				(1 << 30)
 /* Bit 31 must use an unsigned literal; a signed shift would overflow int. */
 #define GENERIC0_INT_ENABLE				(1U << 31)
 #define CP_ME1_PIPE0_INT_STATUS				0xC214
 #define CP_ME1_PIPE1_INT_STATUS				0xC218
 #define CP_ME1_PIPE2_INT_STATUS				0xC21C
 #define CP_ME1_PIPE3_INT_STATUS				0xC220
 #define CP_ME2_PIPE0_INT_STATUS				0xC224
 #define CP_ME2_PIPE1_INT_STATUS				0xC228
 #define CP_ME2_PIPE2_INT_STATUS				0xC22C
 #define CP_ME2_PIPE3_INT_STATUS				0xC230
 /* Single-bit *_INT_STATUS flags (bit positions 13, 17, 23, 26, 29, 30, 31). */
 #define DEQUEUE_REQUEST_INT_STATUS			(1 << 13)
 #define WRM_POLL_TIMEOUT_INT_STATUS			(1 << 17)
 #define PRIV_REG_INT_STATUS				(1 << 23)
 #define TIME_STAMP_INT_STATUS				(1 << 26)
 #define GENERIC2_INT_STATUS				(1 << 29)
 #define GENERIC1_INT_STATUS				(1 << 30)
 /* Bit 31 must use an unsigned literal; a signed shift would overflow int. */
 #define GENERIC0_INT_STATUS				(1U << 31)
 #define CP_HPD_EOP_BASE_ADDR				0xC904
 #define CP_HPD_EOP_BASE_ADDR_HI				0xC908
 #define CP_HPD_EOP_VMID					0xC90C
 #define CP_HPD_EOP_CONTROL				0xC910
 #define	EOP_SIZE(x)					((x) << 0)
 #define	EOP_SIZE_MASK					(0x3f << 0)
 #define CP_MQD_BASE_ADDR				0xC914
 #define CP_MQD_BASE_ADDR_HI				0xC918
 #define CP_HQD_ACTIVE					0xC91C
 #define CP_HQD_VMID					0xC920
 #define CP_HQD_PERSISTENT_STATE				0xC924u
 #define	DEFAULT_CP_HQD_PERSISTENT_STATE			(0x33U << 8)
 #define	PRELOAD_REQ					(1 << 0)
 #define CP_HQD_PIPE_PRIORITY				0xC928u
 #define CP_HQD_QUEUE_PRIORITY				0xC92Cu
 #define CP_HQD_QUANTUM					0xC930u
 #define	QUANTUM_EN					1U
 #define	QUANTUM_SCALE_1MS				(1U << 4)
 #define	QUANTUM_DURATION(x)				((x) << 8)
 #define CP_HQD_PQ_BASE					0xC934
 #define CP_HQD_PQ_BASE_HI				0xC938
 #define CP_HQD_PQ_RPTR					0xC93C
 #define CP_HQD_PQ_RPTR_REPORT_ADDR			0xC940
 #define CP_HQD_PQ_RPTR_REPORT_ADDR_HI			0xC944
 #define CP_HQD_PQ_WPTR_POLL_ADDR			0xC948
 #define CP_HQD_PQ_WPTR_POLL_ADDR_HI			0xC94C
 /* CP_HQD_PQ_DOORBELL_CONTROL register offset and the field helpers listed with it. */
 #define CP_HQD_PQ_DOORBELL_CONTROL			0xC950
 #define	DOORBELL_OFFSET(x)				((x) << 2)
 #define	DOORBELL_OFFSET_MASK				(0x1fffff << 2)
 #define	DOORBELL_SOURCE					(1 << 28)
 #define	DOORBELL_SCHD_HIT				(1 << 29)
 #define	DOORBELL_EN					(1 << 30)
 /* Bit 31 must use an unsigned literal; a signed shift would overflow int. */
 #define	DOORBELL_HIT					(1U << 31)
 #define CP_HQD_PQ_WPTR					0xC954
 /* CP_HQD_PQ_CONTROL register offset and the field helpers listed with it. */
 #define CP_HQD_PQ_CONTROL				0xC958
 #define	QUEUE_SIZE(x)					((x) << 0)
 #define	QUEUE_SIZE_MASK					(0x3f << 0)
 #define	RPTR_BLOCK_SIZE(x)				((x) << 8)
 #define	RPTR_BLOCK_SIZE_MASK				(0x3f << 8)
 #define	MIN_AVAIL_SIZE(x)				((x) << 20)
 #define	PQ_ATC_EN					(1 << 23)
 #define	PQ_VOLATILE					(1 << 26)
 #define	NO_UPDATE_RPTR					(1 << 27)
 #define	UNORD_DISPATCH					(1 << 28)
 #define	ROQ_PQ_IB_FLIP					(1 << 29)
 #define	PRIV_STATE					(1 << 30)
 /* Bit 31 must use an unsigned literal; a signed shift would overflow int. */
 #define	KMD_QUEUE					(1U << 31)
 /* Pre-shifted default field values. */
 #define	DEFAULT_RPTR_BLOCK_SIZE				RPTR_BLOCK_SIZE(5)
 #define	DEFAULT_MIN_AVAIL_SIZE				MIN_AVAIL_SIZE(3)
 #define CP_HQD_IB_BASE_ADDR				0xC95Cu
 #define CP_HQD_IB_BASE_ADDR_HI				0xC960u
 #define CP_HQD_IB_RPTR					0xC964u
 #define CP_HQD_IB_CONTROL				0xC968u
 #define	IB_ATC_EN					(1U << 23)
 #define	DEFAULT_MIN_IB_AVAIL_SIZE			(3U << 20)
 #define CP_HQD_DEQUEUE_REQUEST				0xC974
 #define	DEQUEUE_REQUEST_DRAIN				1
 #define DEQUEUE_REQUEST_RESET				2
 #define		DEQUEUE_INT					(1U << 8)
 #define CP_HQD_SEMA_CMD					0xC97Cu
 #define CP_HQD_MSG_TYPE					0xC980u
 #define CP_HQD_ATOMIC0_PREOP_LO				0xC984u
 #define CP_HQD_ATOMIC0_PREOP_HI				0xC988u
 #define CP_HQD_ATOMIC1_PREOP_LO				0xC98Cu
 #define CP_HQD_ATOMIC1_PREOP_HI				0xC990u
 #define CP_HQD_HQ_SCHEDULER0				0xC994u
 #define CP_HQD_HQ_SCHEDULER1				0xC998u
 #define CP_MQD_CONTROL					0xC99C
 #define	MQD_VMID(x)					((x) << 0)
 #define	MQD_VMID_MASK					(0xf << 0)
 #define	MQD_CONTROL_PRIV_STATE_EN			(1U << 8)
 /* GRBM_GFX_INDEX register offset and the field helpers listed with it. */
 #define GRBM_GFX_INDEX					0x30800
 #define	INSTANCE_INDEX(x)				((x) << 0)
 #define	SH_INDEX(x)					((x) << 8)
 #define	SE_INDEX(x)					((x) << 16)
 #define	SH_BROADCAST_WRITES				(1 << 29)
 #define	INSTANCE_BROADCAST_WRITES			(1 << 30)
 /* Bit 31 must use an unsigned literal; a signed shift would overflow int. */
 #define	SE_BROADCAST_WRITES				(1U << 31)
 #define SQC_CACHES					0x30d20
 #define SQC_POLICY					0x8C38u
 #define SQC_VOLATILE					0x8C3Cu
 #define CP_PERFMON_CNTL					0x36020
 #define ATC_VMID0_PASID_MAPPING				0x339Cu
 #define	ATC_VMID_PASID_MAPPING_UPDATE_STATUS		0x3398u
 #define	ATC_VMID_PASID_MAPPING_VALID			(1U << 31)
 #define ATC_VM_APERTURE0_CNTL				0x3310u
 #define	ATS_ACCESS_MODE_NEVER				0
 #define	ATS_ACCESS_MODE_ALWAYS				1
 #define ATC_VM_APERTURE0_CNTL2				0x3318u
 #define ATC_VM_APERTURE0_HIGH_ADDR			0x3308u
 #define ATC_VM_APERTURE0_LOW_ADDR			0x3300u
 #define ATC_VM_APERTURE1_CNTL				0x3314u
 #define ATC_VM_APERTURE1_CNTL2				0x331Cu
 #define ATC_VM_APERTURE1_HIGH_ADDR			0x330Cu
 #define ATC_VM_APERTURE1_LOW_ADDR			0x3304u
 #endif
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@ -0,0 +1,576 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/err.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
 #include <uapi/linux/kfd_ioctl.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <uapi/asm-generic/mman-common.h>
 #include <asm/processor.h>
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 /* Forward declarations of the handlers wired into kfd_fops below. */
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
 static int kfd_mmap(struct file *, struct vm_area_struct *);
 /* Name used for the chrdev registration, the class and the device. */
 static const char kfd_dev_name[] = "kfd";
 /* The same ioctl handler serves both the native and the compat entry point. */
 static const struct file_operations kfd_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = kfd_ioctl,
 	.compat_ioctl = kfd_ioctl,
 	.open = kfd_open,
 	.mmap = kfd_mmap,
 };
 /* Major number obtained from register_chrdev(); -1 until kfd_chardev_init(). */
 static int kfd_char_dev_major = -1;
 /* Class created by kfd_chardev_init() and destroyed by kfd_chardev_exit(). */
 static struct class *kfd_class;
 /* Device created by kfd_chardev_init(); also the dev_dbg()/dev_err() target. */
 struct device *kfd_device;
 /*
  * Register the "kfd" character device, then create its class and device.
  *
  * Returns 0 on success. On failure the steps that already succeeded are
  * undone and the negative error code of the failing step is returned.
  */
 int kfd_chardev_init(void)
 {
 	int ret;
 	/* Major 0 asks the kernel to pick a free major number. */
 	ret = register_chrdev(0, kfd_dev_name, &kfd_fops);
 	kfd_char_dev_major = ret;
 	if (ret < 0)
 		return ret;
 	kfd_class = class_create(THIS_MODULE, kfd_dev_name);
 	if (IS_ERR(kfd_class)) {
 		ret = PTR_ERR(kfd_class);
 		goto undo_chrdev;
 	}
 	kfd_device = device_create(kfd_class, NULL,
 				   MKDEV(kfd_char_dev_major, 0),
 				   NULL, kfd_dev_name);
 	if (IS_ERR(kfd_device)) {
 		ret = PTR_ERR(kfd_device);
 		goto undo_class;
 	}
 	return 0;
 undo_class:
 	class_destroy(kfd_class);
 undo_chrdev:
 	unregister_chrdev(kfd_char_dev_major, kfd_dev_name);
 	return ret;
 }
 void kfd_chardev_exit(void)
 {
 	device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0));
 	class_destroy(kfd_class);
 	unregister_chrdev(kfd_char_dev_major, kfd_dev_name);
 }
 /* Accessor for the device pointer stored by kfd_chardev_init(). */
 struct device *kfd_chardev(void)
 {
 	return kfd_device;
 }
 /*
  * open() handler: obtain the kfd_process for the calling task via
  * kfd_create_process(), record whether the caller is a compat (32-bit)
  * task and initialise its apertures.
  *
  * Returns 0 on success, -ENODEV for any minor other than 0, or the error
  * encoded in the pointer returned by kfd_create_process().
  */
 static int kfd_open(struct inode *inode, struct file *filep)
 {
 	struct kfd_process *p;
 	/* Only minor 0 is served. */
 	if (iminor(inode))
 		return -ENODEV;
 	p = kfd_create_process(current);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 	p->is_32bit_user_mode = is_compat_task();
 	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
 		p->pasid, p->is_32bit_user_mode);
 	kfd_init_apertures(p);
 	return 0;
 }
 /*
  * KFD_IOC_GET_VERSION: report the ioctl interface major/minor version.
  * Returns 0, or -EFAULT if the result cannot be copied to user space.
  */
 static long kfd_ioctl_get_version(struct file *filep, struct kfd_process *p,
 					void __user *arg)
 {
 	struct kfd_ioctl_get_version_args version;
 	version.major_version = KFD_IOCTL_MAJOR_VERSION;
 	version.minor_version = KFD_IOCTL_MINOR_VERSION;
 	return copy_to_user(arg, &version, sizeof(version)) ? -EFAULT : 0;
 }
 static int set_queue_properties_from_user(struct queue_properties *q_properties,
 				struct kfd_ioctl_create_queue_args *args)
 {
 	if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) {
 		pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n");
 		return -EINVAL;
 	}
 	if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) {
 		pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n");
 		return -EINVAL;
 	}
 	if ((args->ring_base_address) &&
 		(!access_ok(VERIFY_WRITE, args->ring_base_address, sizeof(uint64_t)))) {
 		pr_err("kfd: can't access ring base address\n");
 		return -EFAULT;
 	}
 	if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) {
 		pr_err("kfd: ring size must be a power of 2 or 0\n");
 		return -EINVAL;
 	}
 	if (!access_ok(VERIFY_WRITE, args->read_pointer_address, sizeof(uint32_t))) {
 		pr_err("kfd: can't access read pointer\n");
 		return -EFAULT;
 	}
 	if (!access_ok(VERIFY_WRITE, args->write_pointer_address, sizeof(uint32_t))) {
 		pr_err("kfd: can't access write pointer\n");
 		return -EFAULT;
 	}
 	q_properties->is_interop = false;
 	q_properties->queue_percent = args->queue_percentage;
 	q_properties->priority = args->queue_priority;
 	q_properties->queue_address = args->ring_base_address;
 	q_properties->queue_size = args->ring_size;
 	q_properties->read_ptr = (uint32_t *) args->read_pointer_address;
 	q_properties->write_ptr = (uint32_t *) args->write_pointer_address;
 	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
 		args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
 		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
 	else
 		return -ENOTSUPP;
 	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
 		q_properties->format = KFD_QUEUE_FORMAT_AQL;
 	else
 		q_properties->format = KFD_QUEUE_FORMAT_PM4;
 	pr_debug("Queue Percentage (%d, %d)\n",
 			q_properties->queue_percent, args->queue_percentage);
 	pr_debug("Queue Priority (%d, %d)\n",
 			q_properties->priority, args->queue_priority);
 	pr_debug("Queue Address (0x%llX, 0x%llX)\n",
 			q_properties->queue_address, args->ring_base_address);
 	pr_debug("Queue Size (0x%llX, %u)\n",
 			q_properties->queue_size, args->ring_size);
 	pr_debug("Queue r/w Pointers (0x%llX, 0x%llX)\n",
 			(uint64_t) q_properties->read_ptr,
 			(uint64_t) q_properties->write_ptr);
 	pr_debug("Queue Format (%d)\n", q_properties->format);
 	return 0;
 }
 /*
  * KFD_IOC_CREATE_QUEUE: validate the request, bind the process to the
  * requested GPU and create a compute queue on it.
  *
  * On success the queue id and the doorbell mmap offset are written back to
  * the user's argument structure and 0 is returned. Returns -EFAULT when the
  * arguments cannot be copied in or out, -EINVAL for an unknown gpu_id, or
  * the error reported by argument validation, device binding or queue
  * creation. p->mutex is held around binding and queue creation.
  */
 static long kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 					void __user *arg)
 {
 	struct kfd_ioctl_create_queue_args args;
 	struct kfd_dev *dev;
 	int err = 0;
 	unsigned int queue_id;
 	struct kfd_process_device *pdd;
 	struct queue_properties q_properties;
 	memset(&q_properties, 0, sizeof(struct queue_properties));
 	if (copy_from_user(&args, arg, sizeof(args)))
 		return -EFAULT;
 	pr_debug("kfd: creating queue ioctl\n");
 	err = set_queue_properties_from_user(&q_properties, &args);
 	if (err)
 		return err;
 	dev = kfd_device_by_id(args.gpu_id);
 	if (dev == NULL)
 		return -EINVAL;
 	mutex_lock(&p->mutex);
 	pdd = kfd_bind_process_to_device(dev, p);
 	/*
 	 * IS_ERR() is a boolean test. The previous "IS_ERR(pdd) < 0" could never
 	 * be true, so a failed bind went unnoticed.
 	 */
 	if (IS_ERR(pdd)) {
 		err = PTR_ERR(pdd);
 		goto err_bind_process;
 	}
 	pr_debug("kfd: creating queue for PASID %d on GPU 0x%x\n",
 			p->pasid,
 			dev->id);
 	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, 0,
 				KFD_QUEUE_TYPE_COMPUTE, &queue_id);
 	if (err != 0)
 		goto err_create_queue;
 	args.queue_id = queue_id;
 	/* Return gpu_id as doorbell offset for mmap usage */
 	args.doorbell_offset = args.gpu_id << PAGE_SHIFT;
 	if (copy_to_user(arg, &args, sizeof(args))) {
 		err = -EFAULT;
 		goto err_copy_args_out;
 	}
 	mutex_unlock(&p->mutex);
 	pr_debug("kfd: queue id %d was created successfully\n", args.queue_id);
 	pr_debug("ring buffer address == 0x%016llX\n",
 			args.ring_base_address);
 	pr_debug("read ptr address    == 0x%016llX\n",
 			args.read_pointer_address);
 	pr_debug("write ptr address   == 0x%016llX\n",
 			args.write_pointer_address);
 	return 0;
 err_copy_args_out:
 	/* The caller never learned the id, so the queue must not stay alive. */
 	pqm_destroy_queue(&p->pqm, queue_id);
 err_create_queue:
 err_bind_process:
 	mutex_unlock(&p->mutex);
 	return err;
 }
 /*
  * KFD_IOC_DESTROY_QUEUE: destroy the queue named by args.queue_id.
  * Returns -EFAULT if the arguments cannot be read, otherwise the result of
  * pqm_destroy_queue(), which is called with p->mutex held.
  */
 static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
 					void __user *arg)
 {
 	struct kfd_ioctl_destroy_queue_args args;
 	int ret;
 	if (copy_from_user(&args, arg, sizeof(args)))
 		return -EFAULT;
 	pr_debug("kfd: destroying queue id %d for PASID %d\n",
 				args.queue_id,
 				p->pasid);
 	mutex_lock(&p->mutex);
 	ret = pqm_destroy_queue(&p->pqm, args.queue_id);
 	mutex_unlock(&p->mutex);
 	return ret;
 }
 /*
  * KFD_IOC_UPDATE_QUEUE: change ring address, ring size, percentage and
  * priority of an existing queue.
  *
  * Returns -EFAULT if the arguments cannot be read or the ring base address
  * fails access_ok(), -EINVAL for an out-of-range percentage/priority or a
  * ring size that is neither 0 nor a power of two, otherwise the result of
  * pqm_update_queue(), which is called with p->mutex held.
  */
 static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
 					void __user *arg)
 {
 	int retval;
 	struct kfd_ioctl_update_queue_args args;
 	struct queue_properties properties;
 	/*
 	 * Zero the whole structure, as the create path does, so the fields
 	 * this ioctl does not fill in are not uninitialised stack contents
 	 * when handed to pqm_update_queue().
 	 */
 	memset(&properties, 0, sizeof(properties));
 	if (copy_from_user(&args, arg, sizeof(args)))
 		return -EFAULT;
 	if (args.queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) {
 		pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n");
 		return -EINVAL;
 	}
 	if (args.queue_priority > KFD_MAX_QUEUE_PRIORITY) {
 		pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n");
 		return -EINVAL;
 	}
 	/* A zero ring base address is accepted without an access check. */
 	if ((args.ring_base_address) &&
 		(!access_ok(VERIFY_WRITE, args.ring_base_address, sizeof(uint64_t)))) {
 		pr_err("kfd: can't access ring base address\n");
 		return -EFAULT;
 	}
 	if (!is_power_of_2(args.ring_size) && (args.ring_size != 0)) {
 		pr_err("kfd: ring size must be a power of 2 or 0\n");
 		return -EINVAL;
 	}
 	properties.queue_address = args.ring_base_address;
 	properties.queue_size = args.ring_size;
 	properties.queue_percent = args.queue_percentage;
 	properties.priority = args.queue_priority;
 	pr_debug("kfd: updating queue id %d for PASID %d\n",
 			args.queue_id, p->pasid);
 	mutex_lock(&p->mutex);
 	retval = pqm_update_queue(&p->pqm, args.queue_id, &properties);
 	mutex_unlock(&p->mutex);
 	return retval;
 }
 /*
  * KFD_IOC_SET_MEMORY_POLICY: set the default and alternate cache policy
  * (coherent or non-coherent) and the alternate aperture for this process
  * on the requested GPU.
  *
  * Returns 0 on success, -EFAULT if the arguments cannot be read, -EINVAL for
  * an unknown policy value, an unknown gpu_id or when the device queue manager
  * rejects the request, or the error reported by binding the process to the
  * device. p->mutex is held around the bind and the policy update.
  */
 static long kfd_ioctl_set_memory_policy(struct file *filep,
 				struct kfd_process *p, void __user *arg)
 {
 	struct kfd_ioctl_set_memory_policy_args args;
 	struct kfd_dev *dev;
 	int err = 0;
 	struct kfd_process_device *pdd;
 	enum cache_policy default_policy, alternate_policy;
 	if (copy_from_user(&args, arg, sizeof(args)))
 		return -EFAULT;
 	if (args.default_policy != KFD_IOC_CACHE_POLICY_COHERENT
 	    && args.default_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) {
 		return -EINVAL;
 	}
 	if (args.alternate_policy != KFD_IOC_CACHE_POLICY_COHERENT
 	    && args.alternate_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) {
 		return -EINVAL;
 	}
 	dev = kfd_device_by_id(args.gpu_id);
 	if (dev == NULL)
 		return -EINVAL;
 	mutex_lock(&p->mutex);
 	pdd = kfd_bind_process_to_device(dev, p);
 	/*
 	 * IS_ERR() is a boolean test. The previous "IS_ERR(pdd) < 0" could never
 	 * be true, so an error pointer would have reached &pdd->qpd below.
 	 */
 	if (IS_ERR(pdd)) {
 		err = PTR_ERR(pdd);
 		goto out;
 	}
 	default_policy = (args.default_policy == KFD_IOC_CACHE_POLICY_COHERENT)
 			 ? cache_policy_coherent : cache_policy_noncoherent;
 	alternate_policy =
 		(args.alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT)
 		   ? cache_policy_coherent : cache_policy_noncoherent;
 	/* A false/zero result from the queue manager is reported as -EINVAL. */
 	if (!dev->dqm->set_cache_memory_policy(dev->dqm,
 				&pdd->qpd,
 				default_policy,
 				alternate_policy,
 				(void __user *)args.alternate_aperture_base,
 				args.alternate_aperture_size))
 		err = -EINVAL;
 out:
 	mutex_unlock(&p->mutex);
 	return err;
 }
 /*
  * KFD_IOC_GET_CLOCK_COUNTERS: sample the GPU clock counter together with the
  * raw-monotonic and boot-time clocks (both in nanoseconds) and return them
  * to user space.
  *
  * Returns 0 on success, -EFAULT if the arguments cannot be copied in or out,
  * or -EINVAL for an unknown gpu_id.
  */
 static long kfd_ioctl_get_clock_counters(struct file *filep,
 				struct kfd_process *p, void __user *arg)
 {
 	struct kfd_ioctl_get_clock_counters_args args;
 	struct timespec ts;
 	struct kfd_dev *dev;
 	if (copy_from_user(&args, arg, sizeof(args)))
 		return -EFAULT;
 	dev = kfd_device_by_id(args.gpu_id);
 	if (!dev)
 		return -EINVAL;
 	/* Reading GPU clock counter from KGD */
 	args.gpu_clock_counter = kfd2kgd->get_gpu_clock_counter(dev->kgd);
 	/* No access to rdtsc. Using raw monotonic time */
 	getrawmonotonic(&ts);
 	args.cpu_clock_counter = (uint64_t)timespec_to_ns(&ts);
 	get_monotonic_boottime(&ts);
 	args.system_clock_counter = (uint64_t)timespec_to_ns(&ts);
 	/* Since the counter is in nano-seconds we use 1GHz frequency */
 	args.system_clock_freq = 1000000000;
 	return copy_to_user(arg, &args, sizeof(args)) ? -EFAULT : 0;
 }
 /*
  * KFD_IOC_GET_PROCESS_APERTURES: report the LDS, GPUVM and scratch aperture
  * ranges of every device the process has per-device data for, up to
  * NUM_OF_SUPPORTED_GPUS entries.
  *
  * args.num_of_nodes is set to the number of entries filled in. Returns 0 on
  * success or -EFAULT if the arguments cannot be copied in or out. The
  * per-device list is walked with p->mutex held.
  */
 static int kfd_ioctl_get_process_apertures(struct file *filp,
 				struct kfd_process *p, void __user *arg)
 {
 	struct kfd_ioctl_get_process_apertures_args args;
 	struct kfd_process_device_apertures *pAperture;
 	struct kfd_process_device *pdd;
 	/* The trailing newline was missing from this message. */
 	dev_dbg(kfd_device, "get apertures for PASID %d\n", p->pasid);
 	if (copy_from_user(&args, arg, sizeof(args)))
 		return -EFAULT;
 	args.num_of_nodes = 0;
 	mutex_lock(&p->mutex);
 	/*if the process-device list isn't empty*/
 	if (kfd_has_process_device_data(p)) {
 		/* Run over all pdd of the process */
 		pdd = kfd_get_first_process_device_data(p);
 		do {
 			pAperture = &args.process_apertures[args.num_of_nodes];
 			pAperture->gpu_id = pdd->dev->id;
 			pAperture->lds_base = pdd->lds_base;
 			pAperture->lds_limit = pdd->lds_limit;
 			pAperture->gpuvm_base = pdd->gpuvm_base;
 			pAperture->gpuvm_limit = pdd->gpuvm_limit;
 			pAperture->scratch_base = pdd->scratch_base;
 			pAperture->scratch_limit = pdd->scratch_limit;
 			dev_dbg(kfd_device,
 				"node id %u\n", args.num_of_nodes);
 			dev_dbg(kfd_device,
 				"gpu id %u\n", pdd->dev->id);
 			dev_dbg(kfd_device,
 				"lds_base %llX\n", pdd->lds_base);
 			dev_dbg(kfd_device,
 				"lds_limit %llX\n", pdd->lds_limit);
 			dev_dbg(kfd_device,
 				"gpuvm_base %llX\n", pdd->gpuvm_base);
 			dev_dbg(kfd_device,
 				"gpuvm_limit %llX\n", pdd->gpuvm_limit);
 			dev_dbg(kfd_device,
 				"scratch_base %llX\n", pdd->scratch_base);
 			dev_dbg(kfd_device,
 				"scratch_limit %llX\n", pdd->scratch_limit);
 			args.num_of_nodes++;
 			/* Stop at the end of the list or once the output array is full. */
 		} while ((pdd = kfd_get_next_process_device_data(p, pdd)) != NULL &&
 				(args.num_of_nodes < NUM_OF_SUPPORTED_GPUS));
 	}
 	mutex_unlock(&p->mutex);
 	if (copy_to_user(arg, &args, sizeof(args)))
 		return -EFAULT;
 	return 0;
 }
 static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 {
 	struct kfd_process *process;
 	long err = -EINVAL;
 	dev_dbg(kfd_device,
 		"ioctl cmd 0x%x (#%d), arg 0x%lx\n",
 		cmd, _IOC_NR(cmd), arg);
 	process = kfd_get_process(current);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 	switch (cmd) {
 	case KFD_IOC_GET_VERSION:
 		err = kfd_ioctl_get_version(filep, process, (void __user *)arg);
 		break;
 	case KFD_IOC_CREATE_QUEUE:
 		err = kfd_ioctl_create_queue(filep, process,
 						(void __user *)arg);
 		break;
 	case KFD_IOC_DESTROY_QUEUE:
 		err = kfd_ioctl_destroy_queue(filep, process,
 						(void __user *)arg);
 		break;
 	case KFD_IOC_SET_MEMORY_POLICY:
 		err = kfd_ioctl_set_memory_policy(filep, process,
 						(void __user *)arg);
 		break;
 	case KFD_IOC_GET_CLOCK_COUNTERS:
 		err = kfd_ioctl_get_clock_counters(filep, process,
 						(void __user *)arg);
 		break;
 	case KFD_IOC_GET_PROCESS_APERTURES:
 		err = kfd_ioctl_get_process_apertures(filep, process,
 						(void __user *)arg);
 		break;
 	case KFD_IOC_UPDATE_QUEUE:
 		err = kfd_ioctl_update_queue(filep, process,
 						(void __user *)arg);
 		break;
 	default:
 		dev_err(kfd_device,
 			"unknown ioctl cmd 0x%x, arg 0x%lx)\n",
 			cmd, arg);
 		err = -EINVAL;
 		break;
 	}
 	if (err < 0)
 		dev_err(kfd_device,
 			"ioctl error %ld for ioctl cmd 0x%x (#%d)\n",
 			err, cmd, _IOC_NR(cmd));
 	return err;
 }
 /*
  * mmap() handler: hand the request to kfd_doorbell_mmap() for the calling
  * task's kfd_process. Returns its result, or the error encoded in the
  * pointer returned by kfd_get_process().
  */
 static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct kfd_process *p = kfd_get_process(current);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 	return kfd_doorbell_mmap(p, vma);
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@ -0,0 +1,294 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef KFD_CRAT_H_INCLUDED
 #define KFD_CRAT_H_INCLUDED
 #include <linux/types.h>
 #pragma pack(1)
 /*
 * 4CC signature values for the CRAT and CDIT ACPI tables
 */
 #define CRAT_SIGNATURE	"CRAT"
 #define CDIT_SIGNATURE	"CDIT"
 /*
 * Component Resource Association Table (CRAT)
 */
 #define CRAT_OEMID_LENGTH	6
 #define CRAT_OEMTABLEID_LENGTH	8
 #define CRAT_RESERVED_LENGTH	6
 #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1)
 /*
  * Header at the start of a CRAT table. Declared under #pragma pack(1), so
  * the members are laid out back to back with no padding; field order and
  * widths are part of the table format.
  */
 struct crat_header {
 	uint32_t	signature;
 	uint32_t	length;
 	uint8_t		revision;
 	uint8_t		checksum;
 	uint8_t		oem_id[CRAT_OEMID_LENGTH];
 	uint8_t		oem_table_id[CRAT_OEMTABLEID_LENGTH];
 	uint32_t	oem_revision;
 	uint32_t	creator_id;
 	uint32_t	creator_revision;
 	uint32_t	total_entries;	/* number of data entries following the header */
 	uint16_t	num_domains;
 	uint8_t		reserved[CRAT_RESERVED_LENGTH];
 };
 /*
 * The header structure is immediately followed by total_entries of the
 * data definitions
 */
 /*
 * The currently defined subtype entries in the CRAT
 */
 #define CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY	0
 #define CRAT_SUBTYPE_MEMORY_AFFINITY		1
 #define CRAT_SUBTYPE_CACHE_AFFINITY		2
 #define CRAT_SUBTYPE_TLB_AFFINITY		3
 #define CRAT_SUBTYPE_CCOMPUTE_AFFINITY		4
 #define CRAT_SUBTYPE_IOLINK_AFFINITY		5
 #define CRAT_SUBTYPE_MAX			6
 #define CRAT_SIBLINGMAP_SIZE	32
 /*
 * ComputeUnit Affinity structure and definitions
 */
 #define CRAT_CU_FLAGS_ENABLED		0x00000001
 #define CRAT_CU_FLAGS_HOT_PLUGGABLE	0x00000002
 #define CRAT_CU_FLAGS_CPU_PRESENT	0x00000004
 #define CRAT_CU_FLAGS_GPU_PRESENT	0x00000008
 #define CRAT_CU_FLAGS_IOMMU_PRESENT	0x00000010
 #define CRAT_CU_FLAGS_RESERVED		0xffffffe0
 #define CRAT_COMPUTEUNIT_RESERVED_LENGTH 4
 /*
  * Compute-unit affinity entry. Packed (see #pragma pack(1) above); starts
  * with the same type/length/reserved/flags members as crat_subtype_generic.
  */
 struct crat_subtype_computeunit {
 	uint8_t		type;
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;		/* CRAT_CU_FLAGS_* bits */
 	uint32_t	proximity_domain;
 	uint32_t	processor_id_low;
 	uint16_t	num_cpu_cores;
 	uint16_t	num_simd_cores;
 	uint16_t	max_waves_simd;
 	uint16_t	io_count;
 	uint16_t	hsa_capability;
 	uint16_t	lds_size_in_kb;
 	uint8_t		wave_front_size;
 	uint8_t		num_banks;
 	uint16_t	micro_engine_id;
 	uint8_t		num_arrays;
 	uint8_t		num_cu_per_array;
 	uint8_t		num_simd_per_cu;
 	/* NOTE(review): "scatch" looks like a typo for "scratch"; name kept as-is. */
 	uint8_t		max_slots_scatch_cu;
 	uint8_t		reserved2[CRAT_COMPUTEUNIT_RESERVED_LENGTH];
 };
 /*
 * HSA Memory Affinity structure and definitions
 */
 #define CRAT_MEM_FLAGS_ENABLED		0x00000001
 #define CRAT_MEM_FLAGS_HOT_PLUGGABLE	0x00000002
 #define CRAT_MEM_FLAGS_NON_VOLATILE	0x00000004
 #define CRAT_MEM_FLAGS_RESERVED		0xfffffff8
 #define CRAT_MEMORY_RESERVED_LENGTH 8
 /*
  * Memory affinity entry. Packed (see #pragma pack(1) above); base address
  * and length are each split into low/high 32-bit halves.
  */
 struct crat_subtype_memory {
 	uint8_t		type;
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;		/* CRAT_MEM_FLAGS_* bits */
 	/* NOTE(review): "promixity" looks like a typo for "proximity"; name kept as-is. */
 	uint32_t	promixity_domain;
 	uint32_t	base_addr_low;
 	uint32_t	base_addr_high;
 	uint32_t	length_low;
 	uint32_t	length_high;
 	uint32_t	width;
 	uint8_t		reserved2[CRAT_MEMORY_RESERVED_LENGTH];
 };
 /*
 * HSA Cache Affinity structure and definitions
 */
 #define CRAT_CACHE_FLAGS_ENABLED	0x00000001
 #define CRAT_CACHE_FLAGS_DATA_CACHE	0x00000002
 #define CRAT_CACHE_FLAGS_INST_CACHE	0x00000004
 #define CRAT_CACHE_FLAGS_CPU_CACHE	0x00000008
 #define CRAT_CACHE_FLAGS_SIMD_CACHE	0x00000010
 #define CRAT_CACHE_FLAGS_RESERVED	0xffffffe0
 #define CRAT_CACHE_RESERVED_LENGTH 8
 /*
  * Cache affinity entry. Packed (see #pragma pack(1) above); starts with the
  * same type/length/reserved/flags members as crat_subtype_generic.
  */
 struct crat_subtype_cache {
 	uint8_t		type;
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;		/* CRAT_CACHE_FLAGS_* bits */
 	uint32_t	processor_id_low;
 	uint8_t		sibling_map[CRAT_SIBLINGMAP_SIZE];
 	uint32_t	cache_size;
 	uint8_t		cache_level;
 	uint8_t		lines_per_tag;
 	uint16_t	cache_line_size;
 	uint8_t		associativity;
 	uint8_t		cache_properties;
 	uint16_t	cache_latency;
 	uint8_t		reserved2[CRAT_CACHE_RESERVED_LENGTH];
 };
 /*
 * HSA TLB Affinity structure and definitions
 */
 #define CRAT_TLB_FLAGS_ENABLED	0x00000001
 #define CRAT_TLB_FLAGS_DATA_TLB	0x00000002
 #define CRAT_TLB_FLAGS_INST_TLB	0x00000004
 #define CRAT_TLB_FLAGS_CPU_TLB	0x00000008
 #define CRAT_TLB_FLAGS_SIMD_TLB	0x00000010
 #define CRAT_TLB_FLAGS_RESERVED	0xffffffe0
 #define CRAT_TLB_RESERVED_LENGTH 4
 /*
  * TLB affinity entry. Packed (see #pragma pack(1) above). Associativity and
  * size are given separately for data and instruction TLBs at the 2mb, 4k and
  * 1gb granularities named in the member suffixes.
  */
 struct crat_subtype_tlb {
 	uint8_t		type;
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;		/* CRAT_TLB_FLAGS_* bits */
 	uint32_t	processor_id_low;
 	uint8_t		sibling_map[CRAT_SIBLINGMAP_SIZE];
 	uint32_t	tlb_level;
 	uint8_t		data_tlb_associativity_2mb;
 	uint8_t		data_tlb_size_2mb;
 	uint8_t		instruction_tlb_associativity_2mb;
 	uint8_t		instruction_tlb_size_2mb;
 	uint8_t		data_tlb_associativity_4k;
 	uint8_t		data_tlb_size_4k;
 	uint8_t		instruction_tlb_associativity_4k;
 	uint8_t		instruction_tlb_size_4k;
 	uint8_t		data_tlb_associativity_1gb;
 	uint8_t		data_tlb_size_1gb;
 	uint8_t		instruction_tlb_associativity_1gb;
 	uint8_t		instruction_tlb_size_1gb;
 	uint8_t		reserved2[CRAT_TLB_RESERVED_LENGTH];
 };
 /*
 * HSA CCompute/APU Affinity structure and definitions
 */
 #define CRAT_CCOMPUTE_FLAGS_ENABLED	0x00000001
 #define CRAT_CCOMPUTE_FLAGS_RESERVED	0xfffffffe
 #define CRAT_CCOMPUTE_RESERVED_LENGTH 16
 /*
  * CCompute/APU affinity entry. Packed (see #pragma pack(1) above); starts
  * with the same type/length/reserved/flags members as crat_subtype_generic.
  */
 struct crat_subtype_ccompute {
 	uint8_t		type;
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;		/* CRAT_CCOMPUTE_FLAGS_* bits */
 	uint32_t	processor_id_low;
 	uint8_t		sibling_map[CRAT_SIBLINGMAP_SIZE];
 	uint32_t	apu_size;
 	uint8_t		reserved2[CRAT_CCOMPUTE_RESERVED_LENGTH];
 };
 /*
 * HSA IO Link Affinity structure and definitions
 */
 #define CRAT_IOLINK_FLAGS_ENABLED	0x00000001
 #define CRAT_IOLINK_FLAGS_COHERENCY	0x00000002
 #define CRAT_IOLINK_FLAGS_RESERVED	0xfffffffc
 /*
 * IO interface types
 */
 #define CRAT_IOLINK_TYPE_UNDEFINED	0
 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT	1
 #define CRAT_IOLINK_TYPE_PCIEXPRESS	2
 #define CRAT_IOLINK_TYPE_OTHER		3
 #define CRAT_IOLINK_TYPE_MAX		255
 #define CRAT_IOLINK_RESERVED_LENGTH 24
 /*
  * IO link affinity entry describing a link between two proximity domains.
  * Packed (see #pragma pack(1) above).
  */
 struct crat_subtype_iolink {
 	uint8_t		type;
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;		/* CRAT_IOLINK_FLAGS_* bits */
 	uint32_t	proximity_domain_from;
 	uint32_t	proximity_domain_to;
 	uint8_t		io_interface_type;	/* presumably a CRAT_IOLINK_TYPE_* value - confirm */
 	uint8_t		version_major;
 	uint16_t	version_minor;
 	uint32_t	minimum_latency;
 	uint32_t	maximum_latency;
 	uint32_t	minimum_bandwidth_mbs;
 	uint32_t	maximum_bandwidth_mbs;
 	uint32_t	recommended_transfer_size;
 	uint8_t		reserved2[CRAT_IOLINK_RESERVED_LENGTH];
 };
 /*
 * HSA generic sub-type header
 */
 #define CRAT_SUBTYPE_FLAGS_ENABLED 0x00000001
 /*
  * The four leading members shared by every crat_subtype_* structure in this
  * file. Packed (see #pragma pack(1) above).
  */
 struct crat_subtype_generic {
 	uint8_t		type;
 	uint8_t		length;
 	uint16_t	reserved;
 	uint32_t	flags;
 };
 /*
 * Component Locality Distance Information Table (CDIT)
 */
 #define CDIT_OEMID_LENGTH	6
 #define CDIT_OEMTABLEID_LENGTH	8
 /*
  * Header of a CDIT table. Packed (see #pragma pack(1) above). The leading
  * members mirror crat_header up to num_domains.
  */
 struct cdit_header {
 	uint32_t	signature;
 	uint32_t	length;
 	uint8_t		revision;
 	uint8_t		checksum;
 	uint8_t		oem_id[CDIT_OEMID_LENGTH];
 	uint8_t		oem_table_id[CDIT_OEMTABLEID_LENGTH];
 	uint32_t	oem_revision;
 	uint32_t	creator_id;
 	uint32_t	creator_revision;
 	uint32_t	total_entries;
 	uint16_t	num_domains;
 	/* NOTE(review): one-element array, presumably the first byte of trailing data - confirm. */
 	uint8_t		entry[1];
 };
 #pragma pack()
 #endif /* KFD_CRAT_H_INCLUDED */
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@ -0,0 +1,307 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include <linux/amd-iommu.h>
 #include <linux/bsearch.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 /* Value used for kaveri_device_info.mqd_size_aligned. */
 #define MQD_SIZE_ALIGNED 768
 /* Device parameters shared by every Kaveri entry in supported_devices[]. */
 static const struct kfd_device_info kaveri_device_info = {
 	.max_pasid_bits = 16,
 	/* one ring entry is four 32-bit words */
 	.ih_ring_entry_size = 4 * sizeof(uint32_t),
 	.mqd_size_aligned = MQD_SIZE_ALIGNED
 };
 /* One row of supported_devices[]: a PCI device id and its parameters. */
 struct kfd_deviceid {
 	/* PCI device id; compared against pci_dev->device in kgd2kfd_probe() */
 	unsigned short did;
 	/* parameters for that device; must not be NULL (see lookup_device_info) */
 	const struct kfd_device_info *device_info;
 };
 /*
 * Please keep this sorted by increasing device id.
 *
 * lookup_device_info() currently walks the table linearly, so the lookup
 * in this file does not itself depend on the ordering.
 */
 static const struct kfd_deviceid supported_devices[] = {
 	{ 0x1304, &kaveri_device_info },	/* Kaveri */
 	{ 0x1305, &kaveri_device_info },	/* Kaveri */
 	{ 0x1306, &kaveri_device_info },	/* Kaveri */
 	{ 0x1307, &kaveri_device_info },	/* Kaveri */
 	{ 0x1309, &kaveri_device_info },	/* Kaveri */
 	{ 0x130A, &kaveri_device_info },	/* Kaveri */
 	{ 0x130B, &kaveri_device_info },	/* Kaveri */
 	{ 0x130C, &kaveri_device_info },	/* Kaveri */
 	{ 0x130D, &kaveri_device_info },	/* Kaveri */
 	{ 0x130E, &kaveri_device_info },	/* Kaveri */
 	{ 0x130F, &kaveri_device_info },	/* Kaveri */
 	{ 0x1310, &kaveri_device_info },	/* Kaveri */
 	{ 0x1311, &kaveri_device_info },	/* Kaveri */
 	{ 0x1312, &kaveri_device_info },	/* Kaveri */
 	{ 0x1313, &kaveri_device_info },	/* Kaveri */
 	{ 0x1315, &kaveri_device_info },	/* Kaveri */
 	{ 0x1316, &kaveri_device_info },	/* Kaveri */
 	{ 0x1317, &kaveri_device_info },	/* Kaveri */
 	{ 0x1318, &kaveri_device_info },	/* Kaveri */
 	{ 0x131B, &kaveri_device_info },	/* Kaveri */
 	{ 0x131C, &kaveri_device_info },	/* Kaveri */
 	{ 0x131D, &kaveri_device_info },	/* Kaveri */
 };
 static const struct kfd_device_info *lookup_device_info(unsigned short did)
 {
 	size_t i;
 	for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
 		if (supported_devices[i].did == did) {
 			BUG_ON(supported_devices[i].device_info == NULL);
 			return supported_devices[i].device_info;
 		}
 	}
 	return NULL;
 }
 /*
 * Allocate a kfd_dev for a PCI device that KFD supports.
 *
 * @kgd:  KGD handle stored in the new kfd_dev
 * @pdev: PCI device; its device id selects the supported_devices[] entry
 *
 * Returns the new, not yet initialized (init_complete == false) kfd_dev, or
 * NULL when the device id is not supported or the allocation fails.
 */
 struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev)
 {
 	const struct kfd_device_info *info;
 	struct kfd_dev *dev;
 	info = lookup_device_info(pdev->device);
 	if (!info)
 		return NULL;
 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev)
 		return NULL;
 	dev->init_complete = false;
 	dev->pdev = pdev;
 	dev->device_info = info;
 	dev->kgd = kgd;
 	return dev;
 }
 static bool device_iommu_pasid_init(struct kfd_dev *kfd)
 {
 	const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
 					AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
 					AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
 	struct amd_iommu_device_info iommu_info;
 	unsigned int pasid_limit;
 	int err;
 	err = amd_iommu_device_info(kfd->pdev, &iommu_info);
 	if (err < 0) {
 		dev_err(kfd_device,
 			"error getting iommu info. is the iommu enabled?\n");
 		return false;
 	}
 	if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) {
 		dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n",
 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0,
 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0,
 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0);
 		return false;
 	}
 	pasid_limit = min_t(unsigned int,
 			(unsigned int)1 << kfd->device_info->max_pasid_bits,
 			iommu_info.max_pasids);
 	/*
 	 * last pasid is used for kernel queues doorbells
 	 * in the future the last pasid might be used for a kernel thread.
 	 */
 	pasid_limit = min_t(unsigned int,
 				pasid_limit,
 				kfd->doorbell_process_limit - 1);
 	err = amd_iommu_init_device(kfd->pdev, pasid_limit);
 	if (err < 0) {
 		dev_err(kfd_device, "error initializing iommu device\n");
 		return false;
 	}
 	if (!kfd_set_pasid_limit(pasid_limit)) {
 		dev_err(kfd_device, "error setting pasid limit\n");
 		amd_iommu_free_device(kfd->pdev);
 		return false;
 	}
 	return true;
 }
 /*
 * Callback registered with amd_iommu_set_invalidate_ctx_cb().
 *
 * Maps @pdev back to its kfd_dev and, if one is found, unbinds the process
 * identified by @pasid from that device.
 */
 static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 {
 	struct kfd_dev *kfd = kfd_device_by_pci_dev(pdev);
 	if (!kfd)
 		return;
 	kfd_unbind_process_from_device(kfd, pasid);
 }
 /*
 * Bring up KFD on a device previously returned by kgd2kfd_probe().
 *
 * @kfd:           device to initialize
 * @gpu_resources: resources handed over by KGD; copied into
 *                 kfd->shared_resources
 *
 * Steps, in order: sub-allocator, doorbells, topology, interrupts,
 * IOMMU/PASID, device queue manager (create, then start).  When a step
 * fails, the steps that already succeeded are undone in reverse order via
 * the labels at the end of the function.
 *
 * Returns kfd->init_complete, which is set to true only after every step
 * has succeeded.
 */
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			 const struct kgd2kfd_shared_resources *gpu_resources)
 {
 	unsigned int size;
 	kfd->shared_resources = *gpu_resources;
 	/* calculate max size of mqds needed for queues */
 	size = max_num_of_processes *
 		max_num_of_queues_per_process *
 		kfd->device_info->mqd_size_aligned;
 	/* add another 512KB for all other allocations on gart */
 	size += 512 * 1024;
 	if (kfd2kgd->init_sa_manager(kfd->kgd, size)) {
 		dev_err(kfd_device,
 			"Error initializing sa manager for device (%x:%x)\n",
 			kfd->pdev->vendor, kfd->pdev->device);
 		/* nothing has been set up yet, so there is nothing to undo */
 		goto out;
 	}
 	/*
 	 * NOTE(review): none of the error labels below undoes
 	 * kfd_doorbell_init(); confirm whether it needs a matching teardown.
 	 */
 	kfd_doorbell_init(kfd);
 	if (kfd_topology_add_device(kfd) != 0) {
 		dev_err(kfd_device,
 			"Error adding device (%x:%x) to topology\n",
 			kfd->pdev->vendor, kfd->pdev->device);
 		goto kfd_topology_add_device_error;
 	}
 	if (kfd_interrupt_init(kfd)) {
 		dev_err(kfd_device,
 			"Error initializing interrupts for device (%x:%x)\n",
 			kfd->pdev->vendor, kfd->pdev->device);
 		goto kfd_interrupt_error;
 	}
 	if (!device_iommu_pasid_init(kfd)) {
 		dev_err(kfd_device,
 			"Error initializing iommuv2 for device (%x:%x)\n",
 			kfd->pdev->vendor, kfd->pdev->device);
 		goto device_iommu_pasid_error;
 	}
 	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
 						iommu_pasid_shutdown_callback);
 	kfd->dqm = device_queue_manager_init(kfd);
 	if (!kfd->dqm) {
 		dev_err(kfd_device,
 			"Error initializing queue manager for device (%x:%x)\n",
 			kfd->pdev->vendor, kfd->pdev->device);
 		goto device_queue_manager_error;
 	}
 	if (kfd->dqm->start(kfd->dqm) != 0) {
 		dev_err(kfd_device,
 			"Error starting queue manager for device (%x:%x)\n",
 			kfd->pdev->vendor, kfd->pdev->device);
 		goto dqm_start_error;
 	}
 	kfd->init_complete = true;
 	dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor,
 		 kfd->pdev->device);
 	pr_debug("kfd: Starting kfd with the following scheduling policy %d\n",
 		sched_policy);
 	goto out;
 	/* error unwinding: each label undoes the step before the failed one */
 dqm_start_error:
 	device_queue_manager_uninit(kfd->dqm);
 device_queue_manager_error:
 	amd_iommu_free_device(kfd->pdev);
 device_iommu_pasid_error:
 	kfd_interrupt_exit(kfd);
 kfd_interrupt_error:
 	kfd_topology_remove_device(kfd);
 kfd_topology_add_device_error:
 	kfd2kgd->fini_sa_manager(kfd->kgd);
 	dev_err(kfd_device,
 		"device (%x:%x) NOT added due to errors\n",
 		kfd->pdev->vendor, kfd->pdev->device);
 out:
 	return kfd->init_complete;
 }
 void kgd2kfd_device_exit(struct kfd_dev *kfd)
 {
 	if (kfd->init_complete) {
 		device_queue_manager_uninit(kfd->dqm);
 		amd_iommu_free_device(kfd->pdev);
 		kfd_interrupt_exit(kfd);
 		kfd_topology_remove_device(kfd);
 	}
 	kfree(kfd);
 }
 /*
 * Suspend a KFD device.
 *
 * For a fully initialized device the queue manager is stopped and the IOMMU
 * device is freed; a device that never finished initialization is left
 * untouched.  @kfd must not be NULL.
 */
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
 	BUG_ON(kfd == NULL);
 	if (!kfd->init_complete)
 		return;
 	kfd->dqm->stop(kfd->dqm);
 	amd_iommu_free_device(kfd->pdev);
 }
 int kgd2kfd_resume(struct kfd_dev *kfd)
 {
 	unsigned int pasid_limit;
 	int err;
 	BUG_ON(kfd == NULL);
 	pasid_limit = kfd_get_pasid_limit();
 	if (kfd->init_complete) {
 		err = amd_iommu_init_device(kfd->pdev, pasid_limit);
 		if (err < 0)
 			return -ENXIO;
 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
 						iommu_pasid_shutdown_callback);
 		kfd->dqm->start(kfd->dqm);
 	}
 	return 0;
 }
 /*
 * This is called directly from KGD at ISR.
 *
 * Ignored until the device is fully initialized.  Otherwise, under
 * interrupt_lock: if interrupts are active and the ring entry could be
 * enqueued, the interrupt work item is scheduled.
 */
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
 {
 	if (!kfd->init_complete)
 		return;
 	spin_lock(&kfd->interrupt_lock);
 	if (kfd->interrupts_active) {
 		if (enqueue_ih_ring_entry(kfd, ih_ring_entry))
 			schedule_work(&kfd->interrupt_work);
 	}
 	spin_unlock(&kfd->interrupt_lock);
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@ -0,0 +1,146 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #ifndef KFD_DEVICE_QUEUE_MANAGER_H_
 #define KFD_DEVICE_QUEUE_MANAGER_H_
 #include <linux/rwsem.h>
 #include <linux/list.h>
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 /*
 * Constants used by the device queue manager.
 * NOTE(review): the meanings noted below are inferred from the names only;
 * confirm against the users in kfd_device_queue_manager.c.
 */
 /* presumably a queue preemption timeout in milliseconds */
 #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS	(500)
 #define QUEUES_PER_PIPE				(8)
 #define PIPE_PER_ME_CP_SCHEDULING		(3)
 #define CIK_VMID_NUM				(8)
 #define KFD_VMID_START_OFFSET			(8)
 /* same value as CIK_VMID_NUM */
 #define VMID_PER_DEVICE				CIK_VMID_NUM
 #define KFD_DQM_FIRST_PIPE			(0)
 /* List node that refers to one qcm_process_device. */
 struct device_process_node {
 	struct qcm_process_device *qpd;
 	/* linkage; NOTE(review): presumably on device_queue_manager.queues - confirm */
 	struct list_head list;
 };
 /**
 * struct device_queue_manager
 *
 * @create_queue: Queue creation routine.
 *
 * @destroy_queue: Queue destruction routine.
 *
 * @update_queue: Queue update routine.
 *
 * @get_mqd_manager: Returns the mqd manager according to the mqd type.
 *
 * @execute_queues: Dispatches the queues list to the H/W.
 *
 * @register_process: This routine associates a specific process with device.
 *
 * @unregister_process: destroys the associations between process to device.
 *
 * @initialize: Initializes the pipelines and memory module for that device.
 *
 * @start: Initializes the resources/modules that the device needs for queue
 * execution. This function is called on device initialization and after the
 * system wakes up from suspend.
 *
 * @stop: This routine stops execution of all the active queues running on the
 * H/W. It is called on system suspend.
 *
 * @uninitialize: Destroys all the device queue manager resources allocated in
 * initialize routine.
 *
 * @create_kernel_queue: Creates kernel queue. Used for debug queue.
 *
 * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue.
 *
 * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the
 * memory apertures.
 *
 * This struct is a base class for the kfd queues scheduler in the
 * device level. The device base class should expose the basic operations
 * for queue creation and queue destruction. This base class hides the
 * scheduling mode of the driver and the specific implementation of the
 * concrete device. This class is the only class in the queues scheduler
 * that configures the H/W.
 */
 struct device_queue_manager {
 	/* Operations; each one receives the manager itself as first argument. */
 	int	(*create_queue)(struct device_queue_manager *dqm,
 				struct queue *q,
 				struct qcm_process_device *qpd,
 				int *allocate_vmid);
 	int	(*destroy_queue)(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q);
 	int	(*update_queue)(struct device_queue_manager *dqm,
 				struct queue *q);
 	struct mqd_manager * (*get_mqd_manager)
 					(struct device_queue_manager *dqm,
 					enum KFD_MQD_TYPE type);
 	int	(*register_process)(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd);
 	int	(*unregister_process)(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd);
 	int	(*initialize)(struct device_queue_manager *dqm);
 	/* non-zero return is treated as failure by kgd2kfd_device_init() */
 	int	(*start)(struct device_queue_manager *dqm);
 	int	(*stop)(struct device_queue_manager *dqm);
 	void	(*uninitialize)(struct device_queue_manager *dqm);
 	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
 					struct qcm_process_device *qpd);
 	void	(*destroy_kernel_queue)(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
 					struct qcm_process_device *qpd);
 	bool	(*set_cache_memory_policy)(struct device_queue_manager *dqm,
 					   struct qcm_process_device *qpd,
 					   enum cache_policy default_policy,
 					   enum cache_policy alternate_policy,
 					   void __user *alternate_aperture_base,
 					   uint64_t alternate_aperture_size);
 	/*
 	 * State.
 	 * NOTE(review): the notes on the members below are inferred from
 	 * names and types only; confirm against the implementation.
 	 */
 	/* one slot per KFD_MQD_TYPE value */
 	struct mqd_manager	*mqds[KFD_MQD_TYPE_MAX];
 	struct packet_manager	packets;
 	/* presumably the device this manager belongs to */
 	struct kfd_dev		*dev;
 	/* presumably serializes access to the members below */
 	struct mutex		lock;
 	struct list_head	queues;
 	unsigned int		processes_count;
 	unsigned int		queue_count;
 	unsigned int		next_pipe_to_allocate;
 	unsigned int		*allocated_queues;
 	unsigned int		vmid_bitmap;
 	uint64_t		pipelines_addr;
 	struct kfd_mem_obj	*pipeline_mem;
 	uint64_t		fence_gpu_addr;
 	unsigned int		*fence_addr;
 	struct kfd_mem_obj	*fence_mem;
 	bool			active_runlist;
 };
 #endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@ -0,0 +1,255 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "kfd_priv.h"
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
 /*
 * This extension supports kernel-level doorbell management for
 * the kernel queues.
 * Basically the last doorbells page is devoted to kernel queues,
 * which ensures that no user process can get access to the
 * kernel doorbells page.
 */
 /* Taken around every update of doorbell_available_index. */
 static DEFINE_MUTEX(doorbell_mutex);
 /*
 * Bitmap of kernel doorbell slots: a set bit marks a slot handed out by
 * kfd_get_kernel_doorbell() and not yet released.  It holds
 * KFD_MAX_NUM_OF_QUEUES_PER_PROCESS bits, rounded up to whole longs.
 * Being file-static, this single bitmap is used for every kfd_dev.
 */
 static unsigned long doorbell_available_index[
 	DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)] = { 0 };
 /* "Fake" pasid used when computing doorbell offsets for kernel queues. */
 #define KERNEL_DOORBELL_PASID 1
 /* Size of one doorbell; doorbells receive 32-bit writes. */
 #define KFD_SIZE_OF_DOORBELL_IN_BYTES 4
 /*
 * Each device exposes a doorbell aperture, a PCI MMIO aperture that
 * receives 32-bit writes that are passed to queues as wptr values.
 * The doorbells are intended to be written by applications as part
 * of queueing work on user-mode queues.
 * We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks.
 * We map the doorbell address space into user-mode when a process creates
 * its first queue on each device.
 * Although the mapping is done by KFD, it is equivalent to an mmap of
 * the /dev/kfd with the particular device encoded in the mmap offset.
 * There will be other uses for mmap of /dev/kfd, so only a range of
 * offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells.
 */
 /*
 * Number of doorbell bytes allocated for each process: room for
 * KFD_MAX_NUM_OF_QUEUES_PER_PROCESS doorbells, rounded up to a multiple of
 * PAGE_SIZE.
 */
 static inline size_t doorbell_process_allocation(void)
 {
 	const size_t raw_bytes = KFD_SIZE_OF_DOORBELL_IN_BYTES *
 					KFD_MAX_NUM_OF_QUEUES_PER_PROCESS;
 	return roundup(raw_bytes, PAGE_SIZE);
 }
 /*
 * Doorbell calculations for device init.
 *
 * From kfd->shared_resources this derives and stores in @kfd:
 *   doorbell_base          - physical address of the first doorbell used by
 *                            KFD (aperture start plus the rounded-up start
 *                            offset)
 *   doorbell_id_offset     - the same start offset expressed in 32-bit
 *                            doorbells
 *   doorbell_process_limit - number of per-process doorbell allocations that
 *                            fit in the aperture, minus one
 *   doorbell_kernel_ptr    - kernel mapping of one per-process allocation
 *                            starting at doorbell_base
 *
 * A failing ioremap() is fatal (BUG).
 */
 void kfd_doorbell_init(struct kfd_dev *kfd)
 {
 	size_t doorbell_start_offset;
 	size_t doorbell_aperture_size;
 	size_t doorbell_process_limit;
 	/*
 	 * We start with calculations in bytes because the input data might
 	 * only be byte-aligned.
 	 * Only after we have done the rounding can we assume any alignment.
 	 */
 	doorbell_start_offset =
 			roundup(kfd->shared_resources.doorbell_start_offset,
 					doorbell_process_allocation());
 	doorbell_aperture_size =
 			rounddown(kfd->shared_resources.doorbell_aperture_size,
 					doorbell_process_allocation());
 	if (doorbell_aperture_size > doorbell_start_offset)
 		doorbell_process_limit =
 			(doorbell_aperture_size - doorbell_start_offset) /
 						doorbell_process_allocation();
 	else
 		doorbell_process_limit = 0;
 	kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address +
 				doorbell_start_offset;
 	kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32);
 	/*
 	 * One allocation is held back from the limit.  The computed limit is
 	 * an unsigned size_t and may be 0 (aperture too small), so guard the
 	 * subtraction instead of letting it wrap around to SIZE_MAX.
 	 */
 	kfd->doorbell_process_limit = doorbell_process_limit ?
 					doorbell_process_limit - 1 : 0;
 	kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base,
 						doorbell_process_allocation());
 	BUG_ON(!kfd->doorbell_kernel_ptr);
 	pr_debug("kfd: doorbell initialization:\n");
 	pr_debug("kfd: doorbell base           == 0x%08lX\n",
 			(uintptr_t)kfd->doorbell_base);
 	pr_debug("kfd: doorbell_id_offset      == 0x%08lX\n",
 			kfd->doorbell_id_offset);
 	/* this prints the computed limit, before one allocation is held back */
 	pr_debug("kfd: doorbell_process_limit  == 0x%08lX\n",
 			doorbell_process_limit);
 	pr_debug("kfd: doorbell_kernel_offset  == 0x%08lX\n",
 			(uintptr_t)kfd->doorbell_base);
 	pr_debug("kfd: doorbell aperture size  == 0x%08lX\n",
 			kfd->shared_resources.doorbell_aperture_size);
 	pr_debug("kfd: doorbell kernel address == 0x%08lX\n",
 			(uintptr_t)kfd->doorbell_kernel_ptr);
 }
 /*
 * Map the doorbell allocation of @process on one device into @vma.
 *
 * The device is looked up from vma->vm_pgoff, and the mapping must cover
 * exactly one per-process doorbell allocation.
 *
 * Returns -EINVAL when the size is wrong, the device is unknown, or no
 * process-device data exists for this process on that device; otherwise the
 * result of io_remap_pfn_range().
 */
 int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
 {
 	struct kfd_dev *kfd;
 	phys_addr_t doorbell_phys;
 	/*
 	 * For simplicity only a mapping of the entire doorbell allocation
 	 * of a single device & process is accepted.
 	 */
 	if (vma->vm_end - vma->vm_start != doorbell_process_allocation())
 		return -EINVAL;
 	/* The page offset carries the gpu id of the target device. */
 	kfd = kfd_device_by_id(vma->vm_pgoff);
 	if (!kfd)
 		return -EINVAL;
 	/* The process must already have per-device data on this device. */
 	if (!kfd_get_process_device_data(kfd, process, 0))
 		return -EINVAL;
 	/* Physical address of this process's doorbells. */
 	doorbell_phys = kfd_get_process_doorbells(kfd, process);
 	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
 				VM_DONTDUMP | VM_PFNMAP;
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n"
 		 "     target user address == 0x%08llX\n"
 		 "     physical address    == 0x%08llX\n"
 		 "     vm_flags            == 0x%04lX\n"
 		 "     size                == 0x%04lX\n",
 		 (unsigned long long) vma->vm_start, doorbell_phys,
 		 vma->vm_flags, doorbell_process_allocation());
 	return io_remap_pfn_range(vma,
 				vma->vm_start,
 				doorbell_phys >> PAGE_SHIFT,
 				doorbell_process_allocation(),
 				vma->vm_page_prot);
 }
 /*
 * Get a kernel iomem pointer for a doorbell.
 *
 * @kfd:          device whose kernel doorbell mapping is used
 * @doorbell_off: out; doorbell offset of the allocated slot, in 32-bit
 *                doorbells, computed with the "fake" KERNEL_DOORBELL_PASID
 *
 * Reserves the first free slot in doorbell_available_index under
 * doorbell_mutex.  Returns the address of that slot inside
 * kfd->doorbell_kernel_ptr, or NULL (leaving *doorbell_off untouched) when
 * all KFD_MAX_NUM_OF_QUEUES_PER_PROCESS slots are in use.
 */
 u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
 					unsigned int *doorbell_off)
 {
 	u32 inx;
 	BUG_ON(!kfd || !doorbell_off);
 	mutex_lock(&doorbell_mutex);
 	inx = find_first_zero_bit(doorbell_available_index,
 					KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
 	/*
 	 * find_first_zero_bit() returns the bitmap size when no bit is free.
 	 * Only mark a slot when one was actually found; setting bit
 	 * KFD_MAX_NUM_OF_QUEUES_PER_PROCESS would touch memory past the
 	 * bitmap whenever that size is a multiple of BITS_PER_LONG.
 	 */
 	if (inx < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
 		__set_bit(inx, doorbell_available_index);
 	mutex_unlock(&doorbell_mutex);
 	if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
 		return NULL;
 	/*
 	 * Calculating the kernel doorbell offset using "faked" kernel
 	 * pasid that allocated for kernel queues only
 	 */
 	*doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() /
 							sizeof(u32)) + inx;
 	pr_debug("kfd: get kernel queue doorbell\n"
 			 "     doorbell offset   == 0x%08X\n"
 			 "     kernel address    == 0x%08lX\n",
 		*doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx));
 	return kfd->doorbell_kernel_ptr + inx;
 }
 /*
 * Give back a doorbell obtained from kfd_get_kernel_doorbell().
 *
 * The slot number is the distance, in 32-bit doorbells, of @db_addr from
 * kfd->doorbell_kernel_ptr; its bit in doorbell_available_index is cleared
 * under doorbell_mutex.  Neither argument may be NULL.
 */
 void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
 {
 	unsigned int slot;
 	BUG_ON(!kfd || !db_addr);
 	slot = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr);
 	mutex_lock(&doorbell_mutex);
 	__clear_bit(slot, doorbell_available_index);
 	mutex_unlock(&doorbell_mutex);
 }
 /*
 * Write @value to the doorbell at @db and log the write at debug level.
 * A NULL @db is silently ignored.
 */
 inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
 {
 	if (!db)
 		return;
 	writel(value, db);
 	pr_debug("writing %d to doorbell address 0x%p\n", value, db);
 }
 /*
 * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1
 * to doorbells with the process's doorbell page
 */
 unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
 					struct kfd_process *process,
 					unsigned int queue_id)
 {
 	/*
 	 * doorbell_id_offset accounts for doorbells taken by KGD.
 	 * pasid * doorbell_process_allocation/sizeof(u32) adjusts
 	 * to the process's doorbells
 	 */
 	return kfd->doorbell_id_offset +
 		process->pasid * (doorbell_process_allocation()/sizeof(u32)) +
 		queue_id;
 }
 /*
 * Number of per-process doorbell allocations that fit between the doorbell
 * start offset and the aperture size reported in kfd->shared_resources,
 * plus one.
 */
 uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
 {
 	return (kfd->shared_resources.doorbell_aperture_size -
 		kfd->shared_resources.doorbell_start_offset) /
 			doorbell_process_allocation() + 1;
 }
 /*
 * Physical address of the doorbell allocation of @process on @dev: the
 * device's doorbell base plus one per-process allocation for each pasid
 * below the process's own.
 */
 phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
 					struct kfd_process *process)
 {
 	phys_addr_t base = dev->doorbell_base;
 	return base + process->pasid * doorbell_process_allocation();
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@ -0,0 +1,355 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/err.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
 #include <uapi/linux/kfd_ioctl.h>
 #include <linux/time.h>
 #include "kfd_priv.h"
 #include <linux/mm.h>
 #include <uapi/asm-generic/mman-common.h>
 #include <asm/processor.h>
 /*
 * The primary memory I/O features being added for revisions of gfxip
 * beyond 7.0 (Kaveri) are:
 *
 * Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b
 *
 * “Flat” shader memory access – These are new shader vector memory
 * operations that do not reference a T#/V# so a “pointer” is what is
 * sourced from the vector gprs for direct access to memory.
 * This pointer space has the Shared(LDS) and Private(Scratch) memory
 * mapped into this pointer space as apertures.
 * The hardware then determines how to direct the memory request
 * based on what apertures the request falls in.
 *
 * Unaligned support and alignment check
 *
 *
 * System Unified Address - SUA
 *
 * The standard usage for GPU virtual addresses are that they are mapped by
 * a set of page tables we call GPUVM and these page tables are managed by
 * a combination of vidMM/driver software components.  The current virtual
 * address (VA) range for GPUVM is 40b.
 *
 * As of gfxip7.1 and beyond we’re adding the ability for compute memory
 * clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access
 * the same page tables used by host x86 processors and that are managed by
 * the operating system. This is via a technique and hardware called ATC/IOMMU.
 * The GPU has the capability of accessing both the GPUVM and ATC address
 * spaces for a given VMID (process) simultaneously and we call this feature
 * system unified address (SUA).
 *
 * There are three fundamental address modes of operation for a given VMID
 * (process) on the GPU:
 *
 *	HSA64 – 64b pointers and the default address space is ATC
 *	HSA32 – 32b pointers and the default address space is ATC
 *	GPUVM – 64b pointers and the default address space is GPUVM (driver
 *		model mode)
 *
 *
 * HSA64 - ATC/IOMMU 64b
 *
 * A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized
 * by the CPU so an AMD CPU can only access the high area
 * (VA[63:47] == 0x1FFFF) and low area (VA[63:47] == 0) of the address space
 * so the actual VA carried to translation is 48b.  There is a “hole” in
 * the middle of the 64b VA space.
 *
 * The GPU not only has access to all of the CPU accessible address space via
 * ATC/IOMMU, but it also has access to the GPUVM address space.  The “system
 * unified address” feature (SUA) is the mapping of GPUVM and ATC address
 * spaces into a unified pointer space.  The method we take for 64b mode is
 * to map the full 40b GPUVM address space into the hole of the 64b address
 * space.
 * The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we
 * direct requests to be translated via GPUVM page tables instead of the
 * IOMMU path.
 *
 *
 * 64b to 49b Address conversion
 *
 * Note that there are still significant portions of unused regions (holes)
 * in the 64b address space even for the GPU.  There are several places in
 * the pipeline (sw and hw), we wish to compress the 64b virtual address
 * to a 49b address.  This 49b address is constituted of an “ATC” bit
 * plus a 48b virtual address.  This 49b address is what is passed to the
 * translation hardware.  ATC==0 means the 48b address is a GPUVM address
 * (max of 2^40 – 1) intended to be translated via GPUVM page tables.
 * ATC==1 means the 48b address is intended to be translated via IOMMU
 * page tables.
 *
 * A 64b pointer is compared to the apertures that are defined (Base/Limit), in
 * this case the GPUVM aperture (red) is defined and if a pointer falls in this
 * aperture, we subtract the GPUVM_Base address and set the ATC bit to zero
 * as part of the 64b to 49b conversion.
 *
 * Where this 64b to 49b conversion is done is a function of the usage.
 * Most GPU memory access is via memory objects where the driver builds
 * a descriptor which consists of a base address and a memory access by
 * the GPU usually consists of some kind of an offset or Cartesian coordinate
 * that references this memory descriptor.  This is the case for shader
 * instructions that reference the T# or V# constants, or for specified
 * locations of assets (ex. the shader program location).  In these cases
 * the driver is what handles the 64b to 49b conversion and the base
 * address in the descriptor (ex. V# or T# or shader program location)
 * is defined as a 48b address w/ an ATC bit.  For this usage a given
 * memory object cannot straddle multiple apertures in the 64b address
 * space. For example a shader program cannot jump in/out between ATC
 * and GPUVM space.
 *
 * In some cases we wish to pass a 64b pointer to the GPU hardware and
 * the GPU hw does the 64b to 49b conversion before passing memory
 * requests to the cache/memory system.  This is the case for the
 * S_LOAD and FLAT_* shader memory instructions where we have 64b pointers
 * in scalar and vector GPRs respectively.
 *
 * In all cases (no matter where the 64b -> 49b conversion is done), the gfxip
 * hardware sends a 48b address along w/ an ATC bit, to the memory controller
 * on the memory request interfaces.
 *
 *	<client>_MC_rdreq_atc   // read request ATC bit
 *
 *		0 : <client>_MC_rdreq_addr is a GPUVM VA
 *
 *		1 : <client>_MC_rdreq_addr is a ATC VA
 *
 *
 * “Spare” aperture (APE1)
 *
 * We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use
 * apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the
 * config tables for setting cache policies. The “spare” (APE1) aperture is
 * motivated by getting a different Mtype from the default.
 * The default aperture isn’t an actual base/limit aperture; it is just the
 * address space that doesn’t hit any defined base/limit apertures.
 * The following diagram is a complete picture of the gfxip7.x SUA apertures.
 * The APE1 can be placed either below or above
 * the hole (cannot be in the hole).
 *
 *
 * General Aperture definitions and rules
 *
 * An aperture register definition consists of a Base, Limit, Mtype, and
 * usually an ATC bit indicating which translation tables that aperture uses.
 * In all cases (for SUA and DUA apertures discussed later), aperture base
 * and limit definitions are 64KB aligned.
 *
 *	<ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 }
 *
 *	<ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF }
 *
 * The base and limit are considered inclusive to an aperture so being
 * inside an aperture means (address >= Base) AND (address <= Limit).
 *
 * In no case is a payload that straddles multiple apertures expected to work.
 * For example a load_dword_x4 that starts in one aperture and ends in another,
 * does not work.  For the vector FLAT_* ops we have detection capability in
 * the shader for reporting a “memory violation” back to the
 * SQ block for use in traps.
 * A memory violation results when an op falls into the hole,
 * or a payload straddles multiple apertures.  The S_LOAD instruction
 * does not have this detection.
 *
 * Apertures cannot overlap.
 *
 *
 *
 * HSA32 - ATC/IOMMU 32b
 *
 * For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR
 * instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b
 * will not fit so there is only partial visibility to the GPUVM
 * space (defined by the aperture) for S_LOAD and FLAT_* ops.
 * There is no spare (APE1) aperture for HSA32 mode.
 *
 *
 * GPUVM 64b mode (driver model)
 *
 * This mode is related to HSA64 in that the difference really is that
 * the default aperture is GPUVM (ATC==0) and not ATC space.
 * We have gfxip7.x hardware that has FLAT_* and S_LOAD support for
 * SUA GPUVM mode, but does not support HSA32/HSA64.
 *
 *
 * Device Unified Address - DUA
 *
 * Device unified address (DUA) is the name of the feature that maps the
 * Shared(LDS) memory and Private(Scratch) memory into the overall address
 * space for use by the new FLAT_* vector memory ops.  The Shared and
 * Private memories are mapped as apertures into the address space,
 * and the hardware detects when a FLAT_* memory request is to be redirected
 * to the LDS or Scratch memory when it falls into one of these apertures.
 * Like the SUA apertures, the Shared/Private apertures are 64KB aligned and
 * the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes,
 * the Shared/Private apertures are always placed in a limited selection of
 * options in the hole of the 64b address space. For HSA32 mode, the
 * Shared/Private apertures can be placed anywhere in the 32b space
 * except at 0.
 *
 *
 * HSA64 Apertures for FLAT_* vector ops
 *
 * For HSA64 SUA mode, the Shared and Private apertures are always placed
 * in the hole w/ a limited selection of possible locations. The requests
 * that fall in the private aperture are expanded as a function of the
 * work-item id (tid) and redirected to the location of the
 * “hidden private memory”. The hidden private can be placed in either GPUVM
 * or ATC space. The addresses that fall in the shared aperture are
 * re-directed to the on-chip LDS memory hardware.
 *
 *
 * HSA32 Apertures for FLAT_* vector ops
 *
 * In HSA32 mode, the Private and Shared apertures can be placed anywhere
 * in the 32b space except at 0 (Private or Shared Base at zero disables
 * the apertures). If the base address of an aperture is non-zero
 * (i.e. the aperture exists), its size is always 64KB.
 *
 *
 * GPUVM Apertures for FLAT_* vector ops
 *
 * In GPUVM mode, the Shared/Private apertures are specified identically
 * to HSA64 mode where they are always in the hole at a limited selection
 * of locations.
 *
 *
 * Aperture Definitions for SUA and DUA
 *
 * The interpretation of the aperture register definitions for a given
 * VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or
 * GPUVM64 discussed in previous sections. The mode is first decoded, and
 * then the remaining register decode is a function of the mode.
 *
 *
 * SUA Mode Decode
 *
 * For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from
 * the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and
 * the SH_MEM_CONFIG:PTR32 bits.
 *
 * COMPUTE_DISPATCH_INITIATOR:DATA_ATC    SH_MEM_CONFIG:PTR32        Mode
 *
 * 1                                              0                  HSA64
 *
 * 1                                              1                  HSA32
 *
 * 0                                              X                 GPUVM64
 *
 * In general the hardware will ignore the PTR32 bit and treat it
 * as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0
 * when DATA_ATC=0.
 *
 * The DATA_ATC bit is only set for compute dispatches.
 * All “Draw” dispatches are hardcoded to GPUVM64 mode
 * for FLAT_* / S_LOAD operations.
 */
 /*
  * Per-GPU aperture address helpers.
  *
  * Each *_BASE macro places @gpu_num in the top three bits (63:61) of a
  * 64-bit address and adds a fixed per-aperture offset. Each *_LIMIT macro
  * keeps the upper bits of @base and sets all lower bits, giving the last
  * (inclusive) address of the aperture: a 40-bit span for GPUVM and a
  * 32-bit span for the scratch and LDS apertures.
  *
  * Every macro argument is parenthesized before the cast so that an
  * expression argument is evaluated as a whole and then widened.
  */
 #define MAKE_GPUVM_APP_BASE(gpu_num) \
 	(((uint64_t)(gpu_num) << 61) + 0x1000000000000)
 #define MAKE_GPUVM_APP_LIMIT(base) \
 	(((uint64_t)(base) & 0xFFFFFF0000000000) | 0xFFFFFFFFFF)
 #define MAKE_SCRATCH_APP_BASE(gpu_num) \
 	(((uint64_t)(gpu_num) << 61) + 0x100000000)
 #define MAKE_SCRATCH_APP_LIMIT(base) \
 	(((uint64_t)(base) & 0xFFFFFFFF00000000) | 0xFFFFFFFF)
 #define MAKE_LDS_APP_BASE(gpu_num) \
 	(((uint64_t)(gpu_num) << 61) + 0x0)
 #define MAKE_LDS_APP_LIMIT(base) \
 	(((uint64_t)(base) & 0xFFFFFFFF00000000) | 0xFFFFFFFF)
 /*
  * kfd_init_apertures - set the LDS, GPUVM and scratch aperture ranges of
  * @process for every device returned by kfd_topology_enum_kfd_devices().
  *
  * For a 32-bit process all bases and limits are set to 0; otherwise they
  * are derived from the device's enumeration index plus one. The whole
  * walk runs under process->mutex.
  *
  * Returns 0 on success, or -ENOMEM if no per-device data could be
  * obtained for one of the devices (the mutex is released first).
  */
 int kfd_init_apertures(struct kfd_process *process)
 {
 	uint8_t id  = 0;
 	struct kfd_dev *dev;
 	struct kfd_process_device *pdd;
 	mutex_lock(&process->mutex);
 	/*Iterating over all devices*/
 	while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL &&
 		id < NUM_OF_SUPPORTED_GPUS) {
 		pdd = kfd_get_process_device_data(dev, process, 1);
 		/*
 		 * NOTE(review): presumably this returns NULL when the
 		 * per-device data cannot be created; pdd is dereferenced
 		 * below, so bail out instead of oopsing.
 		 */
 		if (pdd == NULL) {
 			mutex_unlock(&process->mutex);
 			return -ENOMEM;
 		}
 		/*
 		 * For 64 bit process aperture will be statically reserved in
 		 * the x86_64 non canonical process address space
 		 * amdkfd doesn't currently support apertures for 32 bit process
 		 */
 		if (process->is_32bit_user_mode) {
 			pdd->lds_base = pdd->lds_limit = 0;
 			pdd->gpuvm_base = pdd->gpuvm_limit = 0;
 			pdd->scratch_base = pdd->scratch_limit = 0;
 		} else {
 			/*
 			 * node id couldn't be 0 - the three MSB bits of
 			 * aperture shouldn't be 0
 			 */
 			pdd->lds_base = MAKE_LDS_APP_BASE(id + 1);
 			pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
 			pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
 			pdd->gpuvm_limit =
 					MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base);
 			pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1);
 			pdd->scratch_limit =
 				MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
 		}
 		dev_dbg(kfd_device, "node id %u\n", id);
 		dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id);
 		dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base);
 		dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit);
 		dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base);
 		dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit);
 		dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base);
 		dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit);
 		id++;
 	}
 	mutex_unlock(&process->mutex);
 	return 0;
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
@ -0,0 +1,176 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 /*
 * KFD Interrupts.
 *
 * AMD GPUs deliver interrupts by pushing an interrupt description onto the
 * interrupt ring and then sending an interrupt. KGD receives the interrupt
 * in ISR and sends us a pointer to each new entry on the interrupt ring.
 *
 * We generally can't process interrupt-signaled events from ISR, so we call
 * out to each interrupt client module (currently only the scheduler) to ask if
 * each interrupt is interesting. If they return true, then it requires further
 * processing so we copy it to an internal interrupt ring and call each
 * interrupt client again from a work-queue.
 *
 * There's no acknowledgment for the interrupts we use. The hardware simply
 * queues a new interrupt each time without waiting.
 *
 * The fixed-size internal queue means that it's possible for us to lose
 * interrupts because we have no back-pressure to the hardware.
 */
 #include <linux/slab.h>
 #include <linux/device.h>
 #include "kfd_priv.h"
 /*
  * Number of entries (not bytes) in the driver-internal interrupt ring;
  * kfd_interrupt_init() multiplies it by the per-device entry size.
  */
 #define KFD_INTERRUPT_RING_SIZE 256
 /* Work item handler that drains the internal ring; defined below. */
 static void interrupt_wq(struct work_struct *);
 int kfd_interrupt_init(struct kfd_dev *kfd)
 {
 	void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE,
 					kfd->device_info->ih_ring_entry_size,
 					GFP_KERNEL);
 	if (!interrupt_ring)
 		return -ENOMEM;
 	kfd->interrupt_ring = interrupt_ring;
 	kfd->interrupt_ring_size =
 		KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size;
 	atomic_set(&kfd->interrupt_ring_wptr, 0);
 	atomic_set(&kfd->interrupt_ring_rptr, 0);
 	spin_lock_init(&kfd->interrupt_lock);
 	INIT_WORK(&kfd->interrupt_work, interrupt_wq);
 	kfd->interrupts_active = true;
 	/*
 	 * After this function returns, the interrupt will be enabled. This
 	 * barrier ensures that the interrupt running on a different processor
 	 * sees all the above writes.
 	 */
 	smp_wmb();
 	return 0;
 }
 /*
  * Shut down interrupt handling for @kfd: mark interrupts inactive, wait
  * for queued work to finish and free the internal ring.
  */
 void kfd_interrupt_exit(struct kfd_dev *kfd)
 {
 	unsigned long irq_flags;
 	/*
 	 * Clear the flag under the interrupt lock: any handler that takes
 	 * the lock after we drop it is guaranteed to see
 	 * interrupts_active == false and therefore stops writing to the
 	 * ring and scheduling work items.
 	 */
 	spin_lock_irqsave(&kfd->interrupt_lock, irq_flags);
 	kfd->interrupts_active = false;
 	spin_unlock_irqrestore(&kfd->interrupt_lock, irq_flags);
 	/*
 	 * No new work can be queued any more, so once the already scheduled
 	 * work has been flushed nothing will touch interrupt_ring and it
 	 * can be released.
 	 */
 	flush_scheduled_work();
 	kfree(kfd->interrupt_ring);
 }
 /*
  * Copy one hardware interrupt ring entry (@ih_ring_entry, of
  * device_info->ih_ring_entry_size bytes) into the internal ring of @kfd
  * and advance the write offset.
  *
  * Returns true when the entry was stored, false when the ring is full and
  * the entry was dropped.
  *
  * This assumes that it can't be called concurrently with itself
  * but only with dequeue_ih_ring_entry.
  */
 bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry)
 {
 	unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr);
 	unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr);
 	/*
 	 * The ring is full when advancing wptr by one entry would make it
 	 * equal to rptr. Bias the difference by the ring size before
 	 * subtracting so it never wraps around the unsigned range; a wrapped
 	 * value reduced modulo the ring size is only correct when the ring
 	 * size divides 2^32.
 	 */
 	if ((rptr + kfd->interrupt_ring_size - wptr) %
 			kfd->interrupt_ring_size ==
 					kfd->device_info->ih_ring_entry_size) {
 		/* This is very bad, the system is likely to hang. */
 		dev_err_ratelimited(kfd_chardev(),
 			"Interrupt ring overflow, dropping interrupt.\n");
 		return false;
 	}
 	memcpy(kfd->interrupt_ring + wptr, ih_ring_entry,
 			kfd->device_info->ih_ring_entry_size);
 	wptr = (wptr + kfd->device_info->ih_ring_entry_size) %
 			kfd->interrupt_ring_size;
 	smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */
 	atomic_set(&kfd->interrupt_ring_wptr, wptr);
 	return true;
 }
 /*
  * Copy the entry at the current read offset of the internal ring into
  * @ih_ring_entry (device_info->ih_ring_entry_size bytes) and advance the
  * read offset by one entry.
  *
  * Returns false, without touching @ih_ring_entry, when the ring is empty
  * (rptr == wptr); returns true after an entry has been copied out.
  *
  * This assumes that it can't be called concurrently with itself
  * but only with enqueue_ih_ring_entry.
  */
 static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry)
 {
 	/*
 	 * Assume that wait queues have an implicit barrier, i.e. anything that
 	 * happened in the ISR before it queued work is visible.
 	 */
 	unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr);
 	unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr);
 	/* Equal offsets mean there is nothing to consume. */
 	if (rptr == wptr)
 		return false;
 	memcpy(ih_ring_entry, kfd->interrupt_ring + rptr,
 			kfd->device_info->ih_ring_entry_size);
 	/* Offsets are byte offsets that wrap at the ring's byte size. */
 	rptr = (rptr + kfd->device_info->ih_ring_entry_size) %
 			kfd->interrupt_ring_size;
 	/*
 	 * Ensure the rptr write update is not visible until
 	 * memcpy has finished reading.
 	 */
 	smp_mb();
 	atomic_set(&kfd->interrupt_ring_rptr, rptr);
 	return true;
 }
 /*
  * Work item handler: pops entries off the internal interrupt ring of the
  * owning kfd_dev until it is empty. The popped entries are currently
  * discarded.
  */
 static void interrupt_wq(struct work_struct *work)
 {
 	struct kfd_dev *kfd = container_of(work, struct kfd_dev,
 						interrupt_work);
 	/* Scratch buffer big enough for one entry, rounded up to dwords. */
 	uint32_t entry[DIV_ROUND_UP(
 				kfd->device_info->ih_ring_entry_size,
 				sizeof(uint32_t))];
 	for (;;) {
 		if (!dequeue_ih_ring_entry(kfd, entry))
 			break;
 	}
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@ -0,0 +1,347 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #include <linux/types.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/printk.h>
 #include "kfd_kernel_queue.h"
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_pm4_headers.h"
 #include "kfd_pm4_opcodes.h"
 /*
  * Bits 30:16 of a packet header, all set. OR'ed into the NOP header that
  * initialize() builds and stores in kq->nop_packet.
  */
 #define PM4_COUNT_ZERO (((1 << 15) - 1) << 16)
 /*
  * Set up kernel queue @kq of @type (DIQ or HIQ) on @dev with a ring of
  * @queue_size bytes.
  *
  * Acquires, in order: the MQD manager, a kernel doorbell, the packet ring,
  * the read- and write-pointer buffers, the queue object and its MQD. A HIQ
  * is then loaded onto its fixed pipe/queue slot; a DIQ gets a fence buffer.
  *
  * Returns true on success. On failure every resource obtained so far is
  * released through the goto ladder at the bottom and false is returned.
  */
 static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
 		enum kfd_queue_type type, unsigned int queue_size)
 {
 	struct queue_properties prop;
 	int retval;
 	union PM4_MES_TYPE_3_HEADER nop;
 	BUG_ON(!kq || !dev);
 	BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ);
 	/* Log the type actually requested, not a fixed constant. */
 	pr_debug("kfd: In func %s initializing queue type %d size %d\n",
 			__func__, type, queue_size);
 	/*
 	 * Start from an all-zero header: only some bitfields are assigned
 	 * below and u32all is then read back, so stack garbage must not
 	 * leak into the remaining bits of the NOP packet.
 	 */
 	nop.u32all = 0;
 	nop.opcode = IT_NOP;
 	nop.type = PM4_TYPE_3;
 	nop.u32all |= PM4_COUNT_ZERO;
 	kq->dev = dev;
 	kq->nop_packet = nop.u32all;
 	switch (type) {
 	case KFD_QUEUE_TYPE_DIQ:
 	case KFD_QUEUE_TYPE_HIQ:
 		kq->mqd = dev->dqm->get_mqd_manager(dev->dqm,
 						KFD_MQD_TYPE_CIK_HIQ);
 		break;
 	default:
 		BUG();
 		break;
 	}
 	if (kq->mqd == NULL)
 		return false;
 	/*
 	 * prop is handed to init_queue() by value; clear it so fields that
 	 * are not explicitly assigned below are zero rather than undefined.
 	 */
 	memset(&prop, 0, sizeof(prop));
 	prop.doorbell_ptr =
 		(uint32_t *)kfd_get_kernel_doorbell(dev, &prop.doorbell_off);
 	if (prop.doorbell_ptr == NULL)
 		goto err_get_kernel_doorbell;
 	retval = kfd2kgd->allocate_mem(dev->kgd,
 					queue_size,
 					PAGE_SIZE,
 					KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
 					(struct kgd_mem **) &kq->pq);
 	if (retval != 0)
 		goto err_pq_allocate_vidmem;
 	kq->pq_kernel_addr = kq->pq->cpu_ptr;
 	kq->pq_gpu_addr = kq->pq->gpu_addr;
 	retval = kfd2kgd->allocate_mem(dev->kgd,
 					sizeof(*kq->rptr_kernel),
 					32,
 					KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
 					(struct kgd_mem **) &kq->rptr_mem);
 	if (retval != 0)
 		goto err_rptr_allocate_vidmem;
 	kq->rptr_kernel = kq->rptr_mem->cpu_ptr;
 	kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr;
 	retval = kfd2kgd->allocate_mem(dev->kgd,
 					sizeof(*kq->wptr_kernel),
 					32,
 					KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
 					(struct kgd_mem **) &kq->wptr_mem);
 	if (retval != 0)
 		goto err_wptr_allocate_vidmem;
 	kq->wptr_kernel = kq->wptr_mem->cpu_ptr;
 	kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr;
 	/* Start with an empty ring and both pointers at offset 0. */
 	memset(kq->pq_kernel_addr, 0, queue_size);
 	memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel));
 	memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel));
 	prop.queue_size = queue_size;
 	prop.is_interop = false;
 	prop.priority = 1;
 	prop.queue_percent = 100;
 	prop.type = type;
 	prop.vmid = 0;
 	prop.queue_address = kq->pq_gpu_addr;
 	/* These two carry GPU addresses, not CPU-dereferenceable pointers. */
 	prop.read_ptr = (uint32_t *) kq->rptr_gpu_addr;
 	prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr;
 	if (init_queue(&kq->queue, prop) != 0)
 		goto err_init_queue;
 	kq->queue->device = dev;
 	kq->queue->process = kfd_get_process(current);
 	retval = kq->mqd->init_mqd(kq->mqd, &kq->queue->mqd,
 					&kq->queue->mqd_mem_obj,
 					&kq->queue->gart_mqd_addr,
 					&kq->queue->properties);
 	if (retval != 0)
 		goto err_init_mqd;
 	/* assign HIQ to HQD */
 	if (type == KFD_QUEUE_TYPE_HIQ) {
 		pr_debug("assigning hiq to hqd\n");
 		kq->queue->pipe = KFD_CIK_HIQ_PIPE;
 		kq->queue->queue = KFD_CIK_HIQ_QUEUE;
 		kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe,
 					kq->queue->queue, NULL);
 	} else {
 		/* allocate fence for DIQ */
 		retval = kfd2kgd->allocate_mem(dev->kgd,
 					sizeof(uint32_t),
 					32,
 					KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
 					(struct kgd_mem **) &kq->fence_mem_obj);
 		if (retval != 0)
 			goto err_alloc_fence;
 		kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr;
 		kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr;
 	}
 	print_queue(kq->queue);
 	return true;
 	/*
 	 * NOTE(review): when the fence allocation fails the MQD set up by
 	 * init_mqd() is not torn down here - confirm whether uninit_queue()
 	 * covers it.
 	 */
 err_alloc_fence:
 err_init_mqd:
 	uninit_queue(kq->queue);
 err_init_queue:
 	kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->wptr_mem);
 err_wptr_allocate_vidmem:
 	kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->rptr_mem);
 err_rptr_allocate_vidmem:
 	kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->pq);
 err_pq_allocate_vidmem:
 	pr_err("kfd: error init pq\n");
 	kfd_release_kernel_doorbell(dev, (u32 *)prop.doorbell_ptr);
 err_get_kernel_doorbell:
 	pr_err("kfd: error init doorbell\n");
 	return false;
 }
 static void uninitialize(struct kernel_queue *kq)
 {
 	BUG_ON(!kq);
 	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
 		kq->mqd->destroy_mqd(kq->mqd,
 					NULL,
 					false,
 					QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
 					kq->queue->pipe,
 					kq->queue->queue);
 	kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->rptr_mem);
 	kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->wptr_mem);
 	kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->pq);
 	kfd_release_kernel_doorbell(kq->dev,
 				(u32 *)kq->queue->properties.doorbell_ptr);
 	uninit_queue(kq->queue);
 }
 /*
  * Reserve room for a packet of @packet_size_in_dwords dwords in the ring
  * of @kq.
  *
  * On success *@buffer_ptr points at the first dword to fill in and
  * kq->pending_wptr holds the write offset that submit_packet() will
  * publish; returns 0. If the packet would not fit before the end of the
  * ring, the tail is padded with NOP packets and the packet is placed at
  * offset 0.
  *
  * Returns -ENOMEM, with *@buffer_ptr set to NULL, when the ring does not
  * have enough free space.
  */
 static int acquire_packet_buffer(struct kernel_queue *kq,
 		size_t packet_size_in_dwords, unsigned int **buffer_ptr)
 {
 	size_t available_size;
 	size_t queue_size_dwords;
 	uint32_t wptr, rptr;
 	unsigned int *queue_address;
 	BUG_ON(!kq || !buffer_ptr);
 	rptr = *kq->rptr_kernel;
 	wptr = *kq->wptr_kernel;
 	queue_address = (unsigned int *)kq->pq_kernel_addr;
 	queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t);
 	pr_debug("kfd: In func %s\nrptr: %d\nwptr: %d\nqueue_address 0x%p\n",
 			__func__, rptr, wptr, queue_address);
 	/* One dword is always kept free so that a full ring != empty ring. */
 	available_size = (rptr - 1 - wptr + queue_size_dwords) %
 							queue_size_dwords;
 	if (packet_size_in_dwords >= queue_size_dwords ||
 			packet_size_in_dwords >= available_size)
 		goto err_no_space;
 	if (wptr + packet_size_in_dwords >= queue_size_dwords) {
 		/*
 		 * The packet restarts at offset 0, so it must end before
 		 * rptr. The check above counted the tail being padded as
 		 * usable space, which it is not for this packet.
 		 */
 		if (packet_size_in_dwords >= rptr)
 			goto err_no_space;
 		/* Pad up to the end of the ring; wptr wraps to 0. */
 		while (wptr > 0) {
 			queue_address[wptr] = kq->nop_packet;
 			wptr = (wptr + 1) % queue_size_dwords;
 		}
 	}
 	*buffer_ptr = &queue_address[wptr];
 	kq->pending_wptr = wptr + packet_size_in_dwords;
 	return 0;
 err_no_space:
 	*buffer_ptr = NULL;
 	return -ENOMEM;
 }
 /*
  * Publish the packet reserved by acquire_packet_buffer(): store
  * kq->pending_wptr as the new write offset and ring the queue's doorbell
  * with it. With DEBUG defined the dwords being submitted are dumped
  * first.
  */
 static void submit_packet(struct kernel_queue *kq)
 {
 #ifdef DEBUG
 	int idx;
 #endif
 	BUG_ON(!kq);
 #ifdef DEBUG
 	idx = *kq->wptr_kernel;
 	while (idx < kq->pending_wptr) {
 		pr_debug("0x%2X ", kq->pq_kernel_addr[idx]);
 		if (idx % 15 == 0)
 			pr_debug("\n");
 		idx++;
 	}
 	pr_debug("\n");
 #endif
 	*kq->wptr_kernel = kq->pending_wptr;
 	write_kernel_doorbell((u32 *)kq->queue->properties.doorbell_ptr,
 				kq->pending_wptr);
 }
 /*
  * Busy-wait until the read pointer of @kq has caught up with its write
  * pointer, or until @timeout_ms milliseconds have elapsed.
  *
  * Returns 0 once the pointers are equal, -ETIME on timeout.
  */
 static int sync_with_hw(struct kernel_queue *kq, unsigned long timeout_ms)
 {
 	unsigned long deadline_ms;
 	BUG_ON(!kq);
 	/* Both sides of the comparison below are jiffies scaled to ms. */
 	deadline_ms = timeout_ms + jiffies * 1000 / HZ;
 	for (;;) {
 		if (*kq->wptr_kernel == *kq->rptr_kernel)
 			return 0;
 		if (time_after(jiffies * 1000 / HZ, deadline_ms)) {
 			pr_err("kfd: kernel_queue %s timeout expired %lu\n",
 				__func__, timeout_ms);
 			pr_err("kfd: wptr: %d rptr: %d\n",
 				*kq->wptr_kernel, *kq->rptr_kernel);
 			return -ETIME;
 		}
 		cpu_relax();
 	}
 }
 /*
  * Abandon a packet reserved by acquire_packet_buffer() but not yet
  * submitted, by resetting kq->pending_wptr to the last published write
  * offset.
  */
 static void rollback_packet(struct kernel_queue *kq)
 {
 	BUG_ON(!kq);
 	/*
 	 * Read the CPU mapping of the write pointer. properties.write_ptr
 	 * is filled with the buffer's GPU address by initialize() and must
 	 * not be dereferenced by the CPU.
 	 */
 	kq->pending_wptr = *kq->wptr_kernel;
 }
 /*
  * Allocate a kernel_queue, wire up its operations and initialize it as a
  * queue of @type on @dev with KFD_KERNEL_QUEUE_SIZE as its size.
  *
  * Returns the new queue, or NULL if allocation or initialization fails.
  */
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 					enum kfd_queue_type type)
 {
 	struct kernel_queue *kq;
 	BUG_ON(!dev);
 	kq = kzalloc(sizeof(*kq), GFP_KERNEL);
 	if (kq == NULL)
 		return NULL;
 	/* Operation table; callers go through these pointers. */
 	kq->rollback_packet = rollback_packet;
 	kq->sync_with_hw = sync_with_hw;
 	kq->submit_packet = submit_packet;
 	kq->acquire_packet_buffer = acquire_packet_buffer;
 	kq->uninitialize = uninitialize;
 	kq->initialize = initialize;
 	if (kq->initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE))
 		return kq;
 	pr_err("kfd: failed to init kernel queue\n");
 	kfree(kq);
 	return NULL;
 }
 /*
  * Tear down @kq through its uninitialize operation and free the
  * kernel_queue structure itself.
  */
 void kernel_queue_uninit(struct kernel_queue *kq)
 {
 	BUG_ON(kq == NULL);
 	(*kq->uninitialize)(kq);
 	kfree(kq);
 }
 /*
  * Smoke test: create a HIQ kernel queue on @dev, submit five NOP packets,
  * wait up to one second for the hardware to consume them, then destroy
  * the queue again.
  */
 void test_kq(struct kfd_dev *dev)
 {
 	struct kernel_queue *kq;
 	uint32_t *buffer, i;
 	int retval;
 	BUG_ON(!dev);
 	pr_debug("kfd: starting kernel queue test\n");
 	kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ);
 	BUG_ON(!kq);
 	retval = kq->acquire_packet_buffer(kq, 5, &buffer);
 	BUG_ON(retval != 0);
 	for (i = 0; i < 5; i++)
 		buffer[i] = kq->nop_packet;
 	kq->submit_packet(kq);
 	/* Do not silently ignore a queue that never drains. */
 	retval = kq->sync_with_hw(kq, 1000);
 	if (retval != 0)
 		pr_err("kfd: kernel queue test failed to sync with hw\n");
 	/* The queue was created only for this test; do not leak it. */
 	kernel_queue_uninit(kq);
 	pr_debug("kfd: ending kernel queue test\n");
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
@ -0,0 +1,69 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #ifndef KFD_KERNEL_QUEUE_H_
 #define KFD_KERNEL_QUEUE_H_
 #include <linux/list.h>
 #include <linux/types.h>
 #include "kfd_priv.h"
 /*
  * A queue whose packets are written by the kernel driver itself.
  * Instances are created with kernel_queue_init() and destroyed with
  * kernel_queue_uninit(); the function pointers below are filled in by
  * kernel_queue_init().
  */
 struct kernel_queue {
 	/* interface */
 	/* Acquire all resources of the queue; returns true on success. */
 	bool	(*initialize)(struct kernel_queue *kq, struct kfd_dev *dev,
 			enum kfd_queue_type type, unsigned int queue_size);
 	/* Release everything initialize() acquired. */
 	void	(*uninitialize)(struct kernel_queue *kq);
 	/*
 	 * Reserve ring space for one packet; returns 0 and sets *buffer_ptr,
 	 * or a negative errno when there is no room.
 	 */
 	int	(*acquire_packet_buffer)(struct kernel_queue *kq,
 					size_t packet_size_in_dwords,
 					unsigned int **buffer_ptr);
 	/* Publish the reserved packet and ring the doorbell. */
 	void	(*submit_packet)(struct kernel_queue *kq);
 	/* Wait until the read pointer reaches the write pointer or timeout. */
 	int	(*sync_with_hw)(struct kernel_queue *kq,
 				unsigned long timeout_ms);
 	/* Drop a reserved but unsubmitted packet. */
 	void	(*rollback_packet)(struct kernel_queue *kq);
 	/* data */
 	struct kfd_dev		*dev;
 	struct mqd_manager	*mqd;
 	struct queue		*queue;
 	/* Write offset (dwords) that the next submit_packet() publishes. */
 	uint32_t		pending_wptr;
 	/* Pre-built NOP packet header used to pad the ring. */
 	unsigned int		nop_packet;
 	/* Read pointer buffer: allocation, CPU mapping and GPU address. */
 	struct kfd_mem_obj	*rptr_mem;
 	uint32_t		*rptr_kernel;
 	uint64_t		rptr_gpu_addr;
 	/* Write pointer buffer: allocation, CPU mapping and GPU address. */
 	struct kfd_mem_obj	*wptr_mem;
 	uint32_t		*wptr_kernel;
 	uint64_t		wptr_gpu_addr;
 	/* Packet ring: allocation, GPU address and CPU mapping. */
 	struct kfd_mem_obj	*pq;
 	uint64_t		pq_gpu_addr;
 	uint32_t		*pq_kernel_addr;
 	/* Fence buffer; initialize() allocates it for DIQ queues only. */
 	struct kfd_mem_obj	*fence_mem_obj;
 	uint64_t		fence_gpu_addr;
 	void			*fence_kernel_address;
 	/* NOTE(review): list linkage; the owning list is not visible here. */
 	struct list_head	list;
 };
 #endif /* KFD_KERNEL_QUEUE_H_ */
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@ -0,0 +1,159 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
 #include <linux/device.h>
 #include "kfd_priv.h"
 /* Strings and version numbers reported through the MODULE_* macros. */
 #define KFD_DRIVER_AUTHOR	"AMD Inc. and others"
 #define KFD_DRIVER_DESC		"Standalone HSA driver for AMD's GPUs"
 #define KFD_DRIVER_DATE		"20141113"
 #define KFD_DRIVER_MAJOR	0
 #define KFD_DRIVER_MINOR	7
 #define KFD_DRIVER_PATCHLEVEL	0
 /* Calls from kfd into kgd; set once by the first kgd2kfd_init(). */
 const struct kfd2kgd_calls *kfd2kgd;
 /* Calls from kgd into kfd; handed to kgd by kgd2kfd_init(). */
 static const struct kgd2kfd_calls kgd2kfd = {
 	.exit		= kgd2kfd_exit,
 	.probe		= kgd2kfd_probe,
 	.device_init	= kgd2kfd_device_init,
 	.device_exit	= kgd2kfd_device_exit,
 	.interrupt	= kgd2kfd_interrupt,
 	.suspend	= kgd2kfd_suspend,
 	.resume		= kgd2kfd_resume,
 };
 /*
  * Module parameters. All are exported with mode 0444 and are range
  * checked in kfd_module_init().
  */
 int sched_policy = KFD_SCHED_POLICY_HWS;
 module_param(sched_policy, int, 0444);
 MODULE_PARM_DESC(sched_policy,
 	"Kernel cmdline parameter that defines the amdkfd scheduling policy");
 int max_num_of_processes = KFD_MAX_NUM_OF_PROCESSES_DEFAULT;
 module_param(max_num_of_processes, int, 0444);
 MODULE_PARM_DESC(max_num_of_processes,
 	"Kernel cmdline parameter that defines the amdkfd maximum number of supported processes");
 int max_num_of_queues_per_process = KFD_MAX_NUM_OF_QUEUES_PER_PROCESS_DEFAULT;
 module_param(max_num_of_queues_per_process, int, 0444);
 MODULE_PARM_DESC(max_num_of_queues_per_process,
 	"Kernel cmdline parameter that defines the amdkfd maximum number of supported queues per process");
 /*
  * kgd2kfd_init - exchange the call tables between kgd and kfd.
  * @interface_version: version the caller was built against
  * @f2g: kfd-to-kgd call table supplied by the caller
  * @g2f: output, receives the kgd-to-kfd call table
  *
  * Returns false, leaving *@g2f untouched, when @interface_version does
  * not match KFD_INTERFACE_VERSION. Otherwise *@g2f is always filled in
  * and true is returned; @f2g is only recorded on the first successful
  * call.
  */
 bool kgd2kfd_init(unsigned interface_version,
 		  const struct kfd2kgd_calls *f2g,
 		  const struct kgd2kfd_calls **g2f)
 {
 	/*
 	 * Only one interface version is supported,
 	 * no kfd/kgd version skew allowed.
 	 */
 	if (interface_version != KFD_INTERFACE_VERSION)
 		return false;
 	/*
 	 * Hand out our table on every successful call so that a caller
 	 * never sees "true" together with an unset *g2f.
 	 */
 	*g2f = &kgd2kfd;
 	/* Protection against multiple amd kgd loads */
 	if (!kfd2kgd)
 		kfd2kgd = f2g;
 	return true;
 }
 EXPORT_SYMBOL(kgd2kfd_init);
 /* The .exit entry of the kgd2kfd call table; currently does nothing. */
 void kgd2kfd_exit(void)
 {
 }
 /*
  * Module entry point: validate the module parameters, then bring up the
  * PASID allocator, the character device and the topology, and create the
  * process work-queue.
  *
  * Returns 0 on success, -EINVAL for an out-of-range module parameter, or
  * the error of the sub-system that failed (after undoing the earlier
  * steps).
  */
 static int __init kfd_module_init(void)
 {
 	int err;
 	kfd2kgd = NULL;
 	/*
 	 * Verify module parameters. A bare -1 would reach user space as
 	 * -EPERM, so report bad values as -EINVAL.
 	 */
 	if ((sched_policy < KFD_SCHED_POLICY_HWS) ||
 		(sched_policy > KFD_SCHED_POLICY_NO_HWS)) {
 		pr_err("kfd: sched_policy has invalid value\n");
 		return -EINVAL;
 	}
 	if ((max_num_of_processes < 0) ||
 		(max_num_of_processes > KFD_MAX_NUM_OF_PROCESSES)) {
 		pr_err("kfd: max_num_of_processes must be between 0 to KFD_MAX_NUM_OF_PROCESSES\n");
 		return -EINVAL;
 	}
 	if ((max_num_of_queues_per_process < 0) ||
 		(max_num_of_queues_per_process >
 			KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)) {
 		pr_err("kfd: max_num_of_queues_per_process must be between 0 to KFD_MAX_NUM_OF_QUEUES_PER_PROCESS\n");
 		return -EINVAL;
 	}
 	err = kfd_pasid_init();
 	if (err < 0)
 		goto err_pasid;
 	err = kfd_chardev_init();
 	if (err < 0)
 		goto err_ioctl;
 	err = kfd_topology_init();
 	if (err < 0)
 		goto err_topology;
 	kfd_process_create_wq();
 	dev_info(kfd_device, "Initialized module\n");
 	return 0;
 	/* Unwind in reverse order of initialization. */
 err_topology:
 	kfd_chardev_exit();
 err_ioctl:
 	kfd_pasid_exit();
 err_pasid:
 	return err;
 }
 /*
  * Module exit point: tears down the sub-systems in the reverse order of
  * kfd_module_init() - process work-queue, topology, character device,
  * PASID allocator.
  */
 static void __exit kfd_module_exit(void)
 {
 	kfd_process_destroy_wq();
 	kfd_topology_shutdown();
 	kfd_chardev_exit();
 	kfd_pasid_exit();
 	dev_info(kfd_device, "Removed module\n");
 }
 /* Register the entry/exit points and the module metadata. */
 module_init(kfd_module_init);
 module_exit(kfd_module_exit);
 MODULE_AUTHOR(KFD_DRIVER_AUTHOR);
 MODULE_DESCRIPTION(KFD_DRIVER_DESC);
 MODULE_LICENSE("GPL and additional rights");
 /* Version string is "<major>.<minor>.<patchlevel>". */
 MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "."
 	       __stringify(KFD_DRIVER_MINOR) "."
 	       __stringify(KFD_DRIVER_PATCHLEVEL));
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@ -0,0 +1,346 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 #include "cik_regs.h"
 #include "../../radeon/cik_reg.h"
 /*
 * busy_wait - spin, calling cpu_relax(), for as long as jiffies is before @ms.
 *
 * NOTE(review): despite its name, @ms is compared directly against jiffies,
 * i.e. it is treated as an absolute jiffies value and not as a duration in
 * milliseconds - confirm what callers pass before relying on either reading.
 */
 inline void busy_wait(unsigned long ms)
 {
 	while (time_before(jiffies, ms))
 		cpu_relax();
 }
 /* View an opaque MQD pointer as a CIK MQD; the pointer value is unchanged. */
 static inline struct cik_mqd *get_mqd(void *mqd)
 {
 	struct cik_mqd *cik = mqd;
 	return cik;
 }
 /*
 * init_mqd - allocate and initialize the MQD of a compute queue.
 *
 * @mm: mqd manager the queue belongs to
 * @mqd: out, CPU pointer to the initialized MQD
 * @mqd_mem_obj: out, memory object returned by kfd2kgd->allocate_mem()
 * @gart_addr: out, optional (may be NULL), GPU address of the MQD
 * @q: queue properties handed on to mm->update_mqd()
 *
 * The MQD is zeroed, filled with the fixed default values below and then
 * completed by mm->update_mqd(). All pointer arguments except @gart_addr
 * must be non-NULL; @mqd_mem_obj is dereferenced unconditionally, so it is
 * checked together with the others (as init_mqd_hiq() already does).
 *
 * Return: -ENOMEM if the allocation fails, otherwise the value returned by
 * mm->update_mqd(). The allocation is not released here if update_mqd()
 * reports an error.
 */
 static int init_mqd(struct mqd_manager *mm, void **mqd,
 		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 		struct queue_properties *q)
 {
 	uint64_t addr;
 	struct cik_mqd *m;
 	int retval;
 	BUG_ON(!mm || !q || !mqd || !mqd_mem_obj);
 	pr_debug("kfd: In func %s\n", __func__);
 	retval = kfd2kgd->allocate_mem(mm->dev->kgd,
 					sizeof(struct cik_mqd),
 					256,
 					KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
 					(struct kgd_mem **) mqd_mem_obj);
 	if (retval != 0)
 		return -ENOMEM;
 	m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
 	addr = (*mqd_mem_obj)->gpu_addr;
 	memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256));
 	m->header = 0xC0310800;
 	m->compute_pipelinestat_enable = 1;
 	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
 	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
 	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
 	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
 	/*
 	 * Make sure to use the last queue state saved on mqd when the cp
 	 * reassigns the queue, so when queue is switched on/off (e.g over
 	 * subscription or quantum timeout) the context will be consistent
 	 */
 	m->cp_hqd_persistent_state =
 				DEFAULT_CP_HQD_PERSISTENT_STATE | PRELOAD_REQ;
 	m->cp_mqd_control             = MQD_CONTROL_PRIV_STATE_EN;
 	m->cp_mqd_base_addr_lo        = lower_32_bits(addr);
 	m->cp_mqd_base_addr_hi        = upper_32_bits(addr);
 	/* Although WinKFD writes this, I suspect it should not be necessary */
 	m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN;
 	m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
 				QUANTUM_DURATION(10);
 	/*
 	 * Pipe Priority
 	 * Identifies the pipe relative priority when this queue is connected
 	 * to the pipeline. The pipe priority is against the GFX pipe and HP3D.
 	 * In KFD we are using a fixed pipe priority set to CS_MEDIUM.
 	 * 0 = CS_LOW (typically below GFX)
 	 * 1 = CS_MEDIUM (typically between HP3D and GFX)
 	 * 2 = CS_HIGH (typically above HP3D)
 	 */
 	m->cp_hqd_pipe_priority = 1;
 	m->cp_hqd_queue_priority = 15;
 	*mqd = m;
 	if (gart_addr != NULL)
 		*gart_addr = addr;
 	retval = mm->update_mqd(mm, m, q);
 	return retval;
 }
 /*
 * uninit_mqd - hand the memory object backing an MQD to kfd2kgd->free_mem().
 * @mm and @mqd must be non-NULL.
 */
 static void uninit_mqd(struct mqd_manager *mm, void *mqd,
 			struct kfd_mem_obj *mqd_mem_obj)
 {
 	struct kgd_mem *mem = (struct kgd_mem *) mqd_mem_obj;
 	BUG_ON(!mm || !mqd);
 	kfd2kgd->free_mem(mm->dev->kgd, mem);
 }
 /*
 * load_mqd - forward an MQD load request for (@pipe_id, @queue_id) to
 * kfd2kgd->hqd_load() and return its result unchanged.
 */
 static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr)
 {
 	int status;
 	status = kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, wptr);
 	return status;
 }
 /*
 * update_mqd - refresh the queue-dependent fields of a compute MQD.
 *
 * @mm: mqd manager (only checked for NULL here)
 * @mqd: MQD to update, as returned through init_mqd()
 * @q: queue properties to program; q->is_active is updated as a side effect
 *
 * The queue is marked active only when it has a non-zero size, a non-zero
 * ring address and a non-zero queue percentage.
 *
 * Return: always 0.
 */
 static int update_mqd(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
 	struct cik_mqd *m;
 	int size_bits;
 	BUG_ON(!mm || !q || !mqd);
 	pr_debug("kfd: In func %s\n", __func__);
 	m = get_mqd(mqd);
 	m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE |
 				DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN;
 	/*
 	 * Calculating queue size which is log base 2 of actual queue size -1
 	 * dwords and another -1 for ffs.
 	 *
 	 * ffs() yields 0 for an empty queue and 1 for a one-dword queue; in
 	 * both cases the subtraction would go negative and OR set bits all
 	 * over the control word, so the size field is left at 0 instead.
 	 */
 	size_bits = ffs(q->queue_size / sizeof(unsigned int));
 	if (size_bits >= 2)
 		m->cp_hqd_pq_control |= size_bits - 1 - 1;
 	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
 	m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
 	m->cp_hqd_pq_doorbell_control = DOORBELL_EN |
 					DOORBELL_OFFSET(q->doorbell_off);
 	m->cp_hqd_vmid = q->vmid;
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_iq_rptr = AQL_ENABLE;
 		m->cp_hqd_pq_control |= NO_UPDATE_RPTR;
 	}
 	m->cp_hqd_active = 0;
 	q->is_active = false;
 	if (q->queue_size > 0 &&
 			q->queue_address != 0 &&
 			q->queue_percent > 0) {
 		m->cp_hqd_active = 1;
 		q->is_active = true;
 	}
 	return 0;
 }
 /*
 * destroy_mqd - forward a destroy request for the HQD slot
 * (@pipe_id, @queue_id) to kfd2kgd->hqd_destroy() and return its result.
 * @mqd itself is not used by this implementation.
 */
 static int destroy_mqd(struct mqd_manager *mm, void *mqd,
 			enum kfd_preempt_type type,
 			unsigned int timeout, uint32_t pipe_id,
 			uint32_t queue_id)
 {
 	int status;
 	status = kfd2kgd->hqd_destroy(mm->dev->kgd, type, timeout,
 					pipe_id, queue_id);
 	return status;
 }
 /*
 * is_occupied - ask kgd whether the HQD slot (@pipe_id, @queue_id) is
 * occupied for @queue_address, via kfd2kgd->hqd_is_occupies().
 *
 * Like the other mqd_manager callbacks in this file it is reached only
 * through the ops table filled in by mqd_manager_init(), so it is given
 * internal linkage instead of exporting such a generic name kernel-wide.
 * @mqd is not used by this implementation.
 */
 static bool is_occupied(struct mqd_manager *mm, void *mqd,
 		uint64_t queue_address,	uint32_t pipe_id,
 		uint32_t queue_id)
 {
 	return kfd2kgd->hqd_is_occupies(mm->dev->kgd, queue_address,
 					pipe_id, queue_id);
 }
 /*
 * HIQ MQD implementation: the concrete MQD callbacks used for the HIQ.
 * The HIQ queue in Kaveri uses the same MQD structure as all the user mode
 * queues but with different initial values.
 */
 /*
 * init_mqd_hiq - allocate and initialize the MQD used for the HIQ.
 *
 * @mm: mqd manager the queue belongs to
 * @mqd: out, CPU pointer to the initialized MQD
 * @mqd_mem_obj: out, memory object returned by kfd2kgd->allocate_mem()
 * @gart_addr: out, optional (may be NULL), GPU address of the MQD
 * @q: queue properties handed on to mm->update_mqd()
 *
 * Return: -ENOMEM if the allocation fails, otherwise the value returned by
 * mm->update_mqd().
 */
 static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
 		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 		struct queue_properties *q)
 {
 	struct cik_mqd *hiq;
 	uint64_t mqd_gpu_addr;
 	BUG_ON(!mm || !q || !mqd || !mqd_mem_obj);
 	pr_debug("kfd: In func %s\n", __func__);
 	if (kfd2kgd->allocate_mem(mm->dev->kgd,
 					sizeof(struct cik_mqd),
 					256,
 					KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
 					(struct kgd_mem **) mqd_mem_obj) != 0)
 		return -ENOMEM;
 	hiq = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
 	mqd_gpu_addr = (*mqd_mem_obj)->gpu_addr;
 	/* Start from an all-zero descriptor, then fill in the fixed defaults */
 	memset(hiq, 0, ALIGN(sizeof(struct cik_mqd), 256));
 	hiq->header = 0xC0310800;
 	hiq->compute_pipelinestat_enable = 1;
 	hiq->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
 	hiq->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
 	hiq->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
 	hiq->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
 	hiq->cp_hqd_persistent_state = DEFAULT_CP_HQD_PERSISTENT_STATE |
 					PRELOAD_REQ;
 	hiq->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
 				QUANTUM_DURATION(10);
 	/* The descriptor records its own GPU address */
 	hiq->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN;
 	hiq->cp_mqd_base_addr_lo = lower_32_bits(mqd_gpu_addr);
 	hiq->cp_mqd_base_addr_hi = upper_32_bits(mqd_gpu_addr);
 	hiq->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE;
 	/*
 	 * Pipe priority is relative to the GFX pipe and HP3D; KFD uses a
 	 * fixed value of CS_MEDIUM:
 	 * 0 = CS_LOW (typically below GFX)
 	 * 1 = CS_MEDIUM (typically between HP3D and GFX)
 	 * 2 = CS_HIGH (typically above HP3D)
 	 */
 	hiq->cp_hqd_pipe_priority = 1;
 	hiq->cp_hqd_queue_priority = 15;
 	*mqd = hiq;
 	if (gart_addr != NULL)
 		*gart_addr = mqd_gpu_addr;
 	return mm->update_mqd(mm, hiq, q);
 }
 /*
 * update_mqd_hiq - refresh the queue-dependent fields of the HIQ MQD.
 *
 * @mm: mqd manager (only checked for NULL here)
 * @mqd: MQD to update, as returned through init_mqd_hiq()
 * @q: queue properties to program; q->is_active is updated as a side effect
 *
 * Differs from update_mqd() in the control bits it sets (PRIV_STATE and
 * KMD_QUEUE instead of PQ_ATC_EN) and in having no AQL handling.
 *
 * Return: always 0.
 */
 static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 				struct queue_properties *q)
 {
 	struct cik_mqd *m;
 	int size_bits;
 	BUG_ON(!mm || !q || !mqd);
 	pr_debug("kfd: In func %s\n", __func__);
 	m = get_mqd(mqd);
 	m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE |
 				DEFAULT_MIN_AVAIL_SIZE |
 				PRIV_STATE |
 				KMD_QUEUE;
 	/*
 	 * Calculating queue size which is log base 2 of actual queue
 	 * size -1 dwords and another -1 for ffs.
 	 *
 	 * ffs() yields 0 for an empty queue and 1 for a one-dword queue; in
 	 * both cases the subtraction would go negative and OR set bits all
 	 * over the control word, so the size field is left at 0 instead.
 	 */
 	size_bits = ffs(q->queue_size / sizeof(unsigned int));
 	if (size_bits >= 2)
 		m->cp_hqd_pq_control |= size_bits - 1 - 1;
 	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
 	m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
 	m->cp_hqd_pq_doorbell_control = DOORBELL_EN |
 					DOORBELL_OFFSET(q->doorbell_off);
 	m->cp_hqd_vmid = q->vmid;
 	m->cp_hqd_active = 0;
 	q->is_active = false;
 	if (q->queue_size > 0 &&
 			q->queue_address != 0 &&
 			q->queue_percent > 0) {
 		m->cp_hqd_active = 1;
 		q->is_active = true;
 	}
 	return 0;
 }
 /*
 * mqd_manager_init - allocate an mqd manager for @dev and wire up the
 * callbacks that match @type.
 *
 * CP and COMPUTE types share the regular init/update callbacks, the HIQ type
 * uses the *_hiq variants; every other callback is common to all types.
 *
 * Return: the new manager, or NULL if the allocation fails or @type is not
 * one of the handled types.
 */
 struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
 					struct kfd_dev *dev)
 {
 	struct mqd_manager *mgr;
 	BUG_ON(!dev);
 	BUG_ON(type >= KFD_MQD_TYPE_MAX);
 	pr_debug("kfd: In func %s\n", __func__);
 	mgr = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL);
 	if (mgr == NULL)
 		return NULL;
 	mgr->dev = dev;
 	/* Callbacks shared by every handled MQD type */
 	mgr->uninit_mqd = uninit_mqd;
 	mgr->load_mqd = load_mqd;
 	mgr->destroy_mqd = destroy_mqd;
 	mgr->is_occupied = is_occupied;
 	/* Type-specific init/update callbacks */
 	switch (type) {
 	case KFD_MQD_TYPE_CIK_CP:
 	case KFD_MQD_TYPE_CIK_COMPUTE:
 		mgr->init_mqd = init_mqd;
 		mgr->update_mqd = update_mqd;
 		break;
 	case KFD_MQD_TYPE_CIK_HIQ:
 		mgr->init_mqd = init_mqd_hiq;
 		mgr->update_mqd = update_mqd_hiq;
 		break;
 	default:
 		kfree(mgr);
 		mgr = NULL;
 		break;
 	}
 	return mgr;
 }
 /* SDMA queues should be implemented here once the CP supports them */
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@ -0,0 +1,91 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #ifndef KFD_MQD_MANAGER_H_
 #define KFD_MQD_MANAGER_H_
 #include "kfd_priv.h"
 /**
 * struct mqd_manager
 *
 * @init_mqd: Allocates the mqd buffer on local gpu memory and initializes it.
 * NOTE(review): the CIK implementations in kfd_mqd_manager.c request
 * KFD_MEMPOOL_SYSTEM_WRITECOMBINE - confirm the "local gpu memory" wording.
 *
 * @load_mqd: Loads the mqd to a concrete hqd slot. Used only for no cp
 * scheduling mode.
 *
 * @update_mqd: Handles an update call for the MQD.
 *
 * @destroy_mqd: Destroys the HQD slot and by that preempts the relevant queue.
 * Used only for no cp scheduling.
 *
 * @uninit_mqd: Releases the mqd buffer from local gpu memory.
 *
 * @is_occupied: Checks if the relevant HQD slot is occupied.
 *
 * @mqd_mutex: Mqd manager mutex.
 *
 * @dev: The kfd device structure coupled with this module.
 *
 * MQD stands for Memory Queue Descriptor, which represents the current queue
 * state in memory and initiates the HQD (Hardware Queue Descriptor) state.
 * This structure is actually a base class for the different types of MQD
 * structures for the variant ASICs that should be supported in the future.
 * This base class also contains all the MQD specific operations.
 * Another important thing to mention is that each queue has an MQD that keeps
 * its state (or context) after each preemption or reassignment.
 * Basically there is an instance of the mqd manager class per MQD type per
 * ASIC. Currently the kfd driver supports only Kaveri, so there are instances
 * per KFD_MQD_TYPE for each device.
 *
 */
 struct mqd_manager {
 	int	(*init_mqd)(struct mqd_manager *mm, void **mqd,
 			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q);
 	int	(*load_mqd)(struct mqd_manager *mm, void *mqd,
 				uint32_t pipe_id, uint32_t queue_id,
 				uint32_t __user *wptr);
 	int	(*update_mqd)(struct mqd_manager *mm, void *mqd,
 				struct queue_properties *q);
 	int	(*destroy_mqd)(struct mqd_manager *mm, void *mqd,
 				enum kfd_preempt_type type,
 				unsigned int timeout, uint32_t pipe_id,
 				uint32_t queue_id);
 	void	(*uninit_mqd)(struct mqd_manager *mm, void *mqd,
 				struct kfd_mem_obj *mqd_mem_obj);
 	bool	(*is_occupied)(struct mqd_manager *mm, void *mqd,
 				uint64_t queue_address,	uint32_t pipe_id,
 				uint32_t queue_id);
 	struct mutex	mqd_mutex;
 	struct kfd_dev	*dev;
 };
 #endif /* KFD_MQD_MANAGER_H_ */
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@ -0,0 +1,565 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include "kfd_device_queue_manager.h"
 #include "kfd_kernel_queue.h"
 #include "kfd_priv.h"
 #include "kfd_pm4_headers.h"
 #include "kfd_pm4_opcodes.h"
 /*
 * inc_wptr - advance a dword write index by @increment_bytes.
 *
 * *@wptr counts dwords while the increment and the buffer size are given in
 * bytes. Advancing past @buffer_size_bytes is treated as a bug.
 */
 static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
 				unsigned int buffer_size_bytes)
 {
 	unsigned int increment_dwords = increment_bytes / sizeof(uint32_t);
 	unsigned int advanced = *wptr + increment_dwords;
 	BUG_ON((advanced * sizeof(uint32_t)) > buffer_size_bytes);
 	*wptr = advanced;
 }
 static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
 {
 	union PM4_MES_TYPE_3_HEADER header;
 	header.u32all = 0;
 	header.opcode = opcode;
 	header.count = packet_size/sizeof(uint32_t) - 2;
 	header.type = PM4_TYPE_3;
 	return header.u32all;
 }
 /*
 * pm_calc_rlib_size - compute the runlist IB size and detect over
 * subscription.
 *
 * @pm: packet manager; the counts are read from pm->dqm
 * @rlib_size: out, size in bytes needed for the runlist IB
 * @over_subscription: out, true when there is more than one process or more
 * queues than PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE
 *
 * The size covers one map-process packet per process and one map-queues
 * packet per queue, plus one runlist packet when over subscribed (used for
 * chaining the run list).
 */
 static void pm_calc_rlib_size(struct packet_manager *pm,
 				unsigned int *rlib_size,
 				bool *over_subscription)
 {
 	unsigned int nr_processes, nr_queues;
 	BUG_ON(!pm || !rlib_size || !over_subscription);
 	nr_processes = pm->dqm->processes_count;
 	nr_queues = pm->dqm->queue_count;
 	*over_subscription = (nr_processes > 1) ||
 		nr_queues > PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE;
 	if (*over_subscription)
 		pr_debug("kfd: over subscribed runlist\n");
 	*rlib_size = nr_processes * sizeof(struct pm4_map_process) +
 		     nr_queues * sizeof(struct pm4_map_queues);
 	/* Leave room for the chaining runlist packet when over subscribed */
 	if (*over_subscription)
 		*rlib_size += sizeof(struct pm4_runlist);
 	pr_debug("kfd: runlist ib size %d\n", *rlib_size);
 }
 /*
 * pm_allocate_runlist_ib - size, allocate and zero the runlist IB.
 *
 * @pm: packet manager; must not already own an IB (pm->allocated is false)
 * @rl_buffer: out, CPU pointer to the IB
 * @rl_gpu_buffer: out, GPU address of the IB
 * @rl_buffer_size: out, IB size in bytes as computed by pm_calc_rlib_size()
 * @is_over_subscription: out, over subscription flag from pm_calc_rlib_size()
 *
 * On success the memory object is kept in pm->ib_buffer_obj and
 * pm->allocated is set.
 *
 * Return: 0 on success, otherwise the error from kfd2kgd->allocate_mem().
 */
 static int pm_allocate_runlist_ib(struct packet_manager *pm,
 				unsigned int **rl_buffer,
 				uint64_t *rl_gpu_buffer,
 				unsigned int *rl_buffer_size,
 				bool *is_over_subscription)
 {
 	int retval;
 	BUG_ON(!pm);
 	BUG_ON(pm->allocated == true);
 	BUG_ON(is_over_subscription == NULL);
 	pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
 	retval = kfd2kgd->allocate_mem(pm->dqm->dev->kgd,
 					*rl_buffer_size,
 					PAGE_SIZE,
 					KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
 					(struct kgd_mem **) &pm->ib_buffer_obj);
 	if (retval == 0) {
 		*(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr;
 		*rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr;
 		memset(*rl_buffer, 0, *rl_buffer_size);
 		pm->allocated = true;
 	} else {
 		pr_err("kfd: failed to allocate runlist IB\n");
 	}
 	return retval;
 }
 /*
 * pm_create_runlist - write a RUN_LIST packet into @buffer.
 *
 * @pm: packet manager (only checked for NULL)
 * @buffer: destination, at least sizeof(struct pm4_runlist) bytes
 * @ib: GPU address of the runlist IB the packet points at; must be non-zero
 * @ib_size_in_dwords: size of that IB in dwords
 * @chain: value of the packet's chain bit
 *
 * Return: always 0.
 */
 static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
 			uint64_t ib, size_t ib_size_in_dwords, bool chain)
 {
 	struct pm4_runlist *rl;
 	BUG_ON(!pm || !buffer || !ib);
 	rl = (struct pm4_runlist *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_runlist));
 	rl->header.u32all = build_pm4_header(IT_RUN_LIST,
 						sizeof(struct pm4_runlist));
 	/* IB location */
 	rl->ordinal2 = lower_32_bits(ib);
 	rl->bitfields3.ib_base_hi = upper_32_bits(ib);
 	/* IB size and flags */
 	rl->bitfields4.ib_size = ib_size_in_dwords;
 	if (chain)
 		rl->bitfields4.chain = 1;
 	else
 		rl->bitfields4.chain = 0;
 	rl->bitfields4.offload_polling = 0;
 	rl->bitfields4.valid = 1;
 	return 0;
 }
 static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer,
 				struct qcm_process_device *qpd)
 {
 	struct pm4_map_process *packet;
 	struct queue *cur;
 	uint32_t num_queues;
 	BUG_ON(!pm || !buffer || !qpd);
 	packet = (struct pm4_map_process *)buffer;
 	pr_debug("kfd: In func %s\n", __func__);
 	memset(buffer, 0, sizeof(struct pm4_map_process));
 	packet->header.u32all = build_pm4_header(IT_MAP_PROCESS,
 					sizeof(struct pm4_map_process));
 	packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
 	packet->bitfields2.process_quantum = 1;
 	packet->bitfields2.pasid = qpd->pqm->process->pasid;
 	packet->bitfields3.page_table_base = qpd->page_table_base;
 	packet->bitfields10.gds_size = qpd->gds_size;
 	packet->bitfields10.num_gws = qpd->num_gws;
 	packet->bitfields10.num_oac = qpd->num_oac;
 	num_queues = 0;
 	list_for_each_entry(cur, &qpd->queues_list, list)
 		num_queues++;
 	packet->bitfields10.num_queues = num_queues;
 	packet->sh_mem_config = qpd->sh_mem_config;
 	packet->sh_mem_bases = qpd->sh_mem_bases;
 	packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
 	packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
 	packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
 	packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
 	return 0;
 }
 /*
 * pm_create_map_queue - write a MAP_QUEUES packet for one queue into @buffer.
 *
 * @pm: packet manager (only checked for NULL)
 * @buffer: destination, at least sizeof(struct pm4_map_queues) bytes
 * @q: queue to map; only COMPUTE, DIQ and SDMA types are accepted, any other
 * type is treated as a bug
 *
 * Return: always 0.
 */
 static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
 				struct queue *q)
 {
 	struct pm4_map_queues *map;
 	BUG_ON(!pm || !buffer || !q);
 	pr_debug("kfd: In func %s\n", __func__);
 	map = (struct pm4_map_queues *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_map_queues));
 	map->header.u32all = build_pm4_header(IT_MAP_QUEUES,
 						sizeof(struct pm4_map_queues));
 	map->bitfields2.alloc_format =
 				alloc_format__mes_map_queues__one_per_pipe;
 	map->bitfields2.num_queues = 1;
 	map->bitfields2.queue_sel =
 		queue_sel__mes_map_queues__map_to_hws_determined_queue_slots;
 	if (q->properties.is_interop)
 		map->bitfields2.vidmem =
 			vidmem__mes_map_queues__uses_video_memory;
 	else
 		map->bitfields2.vidmem =
 			vidmem__mes_map_queues__uses_no_video_memory;
 	/* Pick the engine from the queue type */
 	switch (q->properties.type) {
 	case KFD_QUEUE_TYPE_COMPUTE:
 	case KFD_QUEUE_TYPE_DIQ:
 		map->bitfields2.engine_sel =
 				engine_sel__mes_map_queues__compute;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
 		map->bitfields2.engine_sel =
 				engine_sel__mes_map_queues__sdma0;
 		break;
 	default:
 		BUG();
 		break;
 	}
 	/* Single-queue packet: only ordinal slot 0 is filled in */
 	map->mes_map_queues_ordinals[0].bitfields3.doorbell_offset =
 			q->properties.doorbell_off;
 	map->mes_map_queues_ordinals[0].mqd_addr_lo =
 			lower_32_bits(q->gart_mqd_addr);
 	map->mes_map_queues_ordinals[0].mqd_addr_hi =
 			upper_32_bits(q->gart_mqd_addr);
 	map->mes_map_queues_ordinals[0].wptr_addr_lo =
 			lower_32_bits((uint64_t)q->properties.write_ptr);
 	map->mes_map_queues_ordinals[0].wptr_addr_hi =
 			upper_32_bits((uint64_t)q->properties.write_ptr);
 	return 0;
 }
 /*
 * pm_create_runlist_ib - allocate the runlist IB and fill it with packets.
 *
 * @pm: packet manager
 * @queues: list of struct device_process_node, one per process
 * @rl_gpu_addr: out, GPU address of the IB
 * @rl_size_bytes: out, allocated size of the IB in bytes
 *
 * For every process on @queues a map-process packet is written, followed by
 * one map-queues packet for each active queue on its priv_queue_list and on
 * its queues_list. When the runlist is over subscribed a chaining runlist
 * packet pointing back at the start of the IB is appended.
 *
 * Return: 0 on success, the allocation error, -ENOMEM when more processes
 * are found on the list than pm->dqm->processes_count, or the error of a
 * packet builder. Only the -ENOMEM path releases the IB here; on the other
 * error paths it stays allocated (pm_send_runlist() releases it when
 * pm->allocated is still set).
 */
 static int pm_create_runlist_ib(struct packet_manager *pm,
 				struct list_head *queues,
 				uint64_t *rl_gpu_addr,
 				size_t *rl_size_bytes)
 {
 	unsigned int alloc_size_bytes;
 	unsigned int *rl_buffer, rl_wptr, i;
 	int retval, proccesses_mapped;
 	struct device_process_node *cur;
 	struct qcm_process_device *qpd;
 	struct queue *q;
 	struct kernel_queue *kq;
 	bool is_over_subscription;
 	BUG_ON(!pm || !queues || !rl_size_bytes || !rl_gpu_addr);
 	/* rl_wptr is the write position inside the IB, counted in dwords */
 	rl_wptr = retval = proccesses_mapped = 0;
 	retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr,
 				&alloc_size_bytes, &is_over_subscription);
 	if (retval != 0)
 		return retval;
 	*rl_size_bytes = alloc_size_bytes;
 	pr_debug("kfd: In func %s\n", __func__);
 	pr_debug("kfd: building runlist ib process count: %d queues count %d\n",
 		pm->dqm->processes_count, pm->dqm->queue_count);
 	/* build the run list ib packet */
 	list_for_each_entry(cur, queues, list) {
 		qpd = cur->qpd;
 		/* build map process packet */
 		/* The IB was sized for processes_count processes only */
 		if (proccesses_mapped >= pm->dqm->processes_count) {
 			pr_debug("kfd: not enough space left in runlist IB\n");
 			pm_release_ib(pm);
 			return -ENOMEM;
 		}
 		retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd);
 		if (retval != 0)
 			return retval;
 		proccesses_mapped++;
 		inc_wptr(&rl_wptr, sizeof(struct pm4_map_process),
 				alloc_size_bytes);
 		/* Kernel queues of this process; inactive ones are skipped */
 		list_for_each_entry(kq, &qpd->priv_queue_list, list) {
 			if (kq->queue->properties.is_active != true)
 				continue;
 			retval = pm_create_map_queue(pm, &rl_buffer[rl_wptr],
 							kq->queue);
 			if (retval != 0)
 				return retval;
 			inc_wptr(&rl_wptr, sizeof(struct pm4_map_queues),
 					alloc_size_bytes);
 		}
 		/* Queues on queues_list; inactive ones are skipped */
 		list_for_each_entry(q, &qpd->queues_list, list) {
 			if (q->properties.is_active != true)
 				continue;
 			retval = pm_create_map_queue(pm,
 						&rl_buffer[rl_wptr], q);
 			if (retval != 0)
 				return retval;
 			inc_wptr(&rl_wptr, sizeof(struct pm4_map_queues),
 					alloc_size_bytes);
 		}
 	}
 	pr_debug("kfd: finished map process and queues to runlist\n");
 	/* Chain the IB back to its own start when over subscribed */
 	if (is_over_subscription)
 		pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr,
 				alloc_size_bytes / sizeof(uint32_t), true);
 	/* Debug dump of the whole IB, one pr_debug() call per dword */
 	for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
 		pr_debug("0x%2X ", rl_buffer[i]);
 	pr_debug("\n");
 	return 0;
 }
 /*
 * pm_init - set up a packet manager for @dqm.
 *
 * Initializes pm->lock and creates the HIQ kernel queue used to submit
 * packets; the manager starts out without a runlist IB.
 *
 * Return: 0 on success, -ENOMEM if the kernel queue cannot be created.
 */
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 {
 	BUG_ON(!dqm);
 	pm->dqm = dqm;
 	mutex_init(&pm->lock);
 	pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
 	if (pm->priv_queue != NULL) {
 		pm->allocated = false;
 		return 0;
 	}
 	/* No queue: undo the mutex setup and report the failure */
 	mutex_destroy(&pm->lock);
 	return -ENOMEM;
 }
 /*
 * pm_uninit - tear down what pm_init() set up: destroys pm->lock and then
 * uninitializes the kernel queue held in pm->priv_queue. A runlist IB that
 * is still allocated is not released here.
 */
 void pm_uninit(struct packet_manager *pm)
 {
 	BUG_ON(!pm);
 	mutex_destroy(&pm->lock);
 	kernel_queue_uninit(pm->priv_queue);
 }
 /*
 * pm_send_set_resources - submit a SET_RESOURCES packet on the HIQ.
 *
 * @pm: packet manager
 * @res: scheduling resources copied into the packet
 *
 * The packet is built and submitted under pm->lock, then the queue is
 * synchronized with the hardware using KFD_HIQ_TIMEOUT.
 *
 * The result of acquire_packet_buffer() is checked in addition to the
 * returned pointer, as the other senders in this file do, and the pointer is
 * pre-set to NULL so a failing acquire that leaves it untouched is caught.
 *
 * Return: 0 on success, -ENOMEM if no packet buffer could be acquired.
 */
 int pm_send_set_resources(struct packet_manager *pm,
 				struct scheduling_resources *res)
 {
 	struct pm4_set_resources *packet = NULL;
 	int retval;
 	BUG_ON(!pm || !res);
 	pr_debug("kfd: In func %s\n", __func__);
 	mutex_lock(&pm->lock);
 	retval = pm->priv_queue->acquire_packet_buffer(pm->priv_queue,
 					sizeof(*packet) / sizeof(uint32_t),
 			(unsigned int **)&packet);
 	if (retval != 0 || packet == NULL) {
 		mutex_unlock(&pm->lock);
 		pr_err("kfd: failed to allocate buffer on kernel queue\n");
 		return -ENOMEM;
 	}
 	memset(packet, 0, sizeof(struct pm4_set_resources));
 	packet->header.u32all = build_pm4_header(IT_SET_RESOURCES,
 					sizeof(struct pm4_set_resources));
 	packet->bitfields2.queue_type =
 			queue_type__mes_set_resources__hsa_interface_queue_hiq;
 	packet->bitfields2.vmid_mask = res->vmid_mask;
 	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY;
 	packet->bitfields7.oac_mask = res->oac_mask;
 	packet->bitfields8.gds_heap_base = res->gds_heap_base;
 	packet->bitfields8.gds_heap_size = res->gds_heap_size;
 	packet->gws_mask_lo = lower_32_bits(res->gws_mask);
 	packet->gws_mask_hi = upper_32_bits(res->gws_mask);
 	packet->queue_mask_lo = lower_32_bits(res->queue_mask);
 	packet->queue_mask_hi = upper_32_bits(res->queue_mask);
 	pm->priv_queue->submit_packet(pm->priv_queue);
 	pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT);
 	mutex_unlock(&pm->lock);
 	return 0;
 }
 /*
 * pm_send_runlist - build the runlist IB and submit a RUN_LIST packet for it.
 *
 * @pm: packet manager
 * @dqm_queues: per-process list passed on to pm_create_runlist_ib()
 *
 * The IB is built without pm->lock held; the lock is taken only around the
 * HIQ packet acquisition, submission and hardware sync. On success the IB
 * is left allocated (pm->allocated stays set); on failure it is released
 * here if it is still allocated.
 *
 * Return: 0 on success, otherwise the error of the step that failed.
 */
 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
 {
 	uint64_t rl_gpu_ib_addr;
 	uint32_t *rl_buffer;
 	size_t rl_ib_size, packet_size_dwords;
 	int retval;
 	BUG_ON(!pm || !dqm_queues);
 	retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr,
 					&rl_ib_size);
 	if (retval != 0)
 		goto fail_create_runlist_ib;
 	pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr);
 	packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t);
 	mutex_lock(&pm->lock);
 	retval = pm->priv_queue->acquire_packet_buffer(pm->priv_queue,
 					packet_size_dwords, &rl_buffer);
 	if (retval != 0)
 		goto fail_acquire_packet_buffer;
 	/* Non-chained RUN_LIST packet pointing at the IB built above */
 	retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr,
 					rl_ib_size / sizeof(uint32_t), false);
 	if (retval != 0)
 		goto fail_create_runlist;
 	pm->priv_queue->submit_packet(pm->priv_queue);
 	pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT);
 	mutex_unlock(&pm->lock);
 	return retval;
 fail_create_runlist:
 	pm->priv_queue->rollback_packet(pm->priv_queue);
 fail_acquire_packet_buffer:
 	mutex_unlock(&pm->lock);
 fail_create_runlist_ib:
 	/* pm_release_ib() takes pm->lock itself, so it is called unlocked */
 	if (pm->allocated == true)
 		pm_release_ib(pm);
 	return retval;
 }
 /*
 * pm_send_query_status - submit a QUERY_STATUS packet on the HIQ.
 *
 * @pm: packet manager
 * @fence_address: address written into the packet; must be non-zero
 * @fence_value: 32-bit value written into the packet's data fields
 *
 * The packet is built and submitted under pm->lock, then the queue is
 * synchronized with the hardware using KFD_HIQ_TIMEOUT.
 *
 * The packet is zeroed before it is filled in, like every other packet
 * built in this file, so fields that are not set explicitly do not carry
 * whatever was previously in the ring buffer slot.
 *
 * Return: 0 on success, otherwise the error from acquire_packet_buffer().
 */
 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 			uint32_t fence_value)
 {
 	int retval;
 	struct pm4_query_status *packet;
 	BUG_ON(!pm || !fence_address);
 	mutex_lock(&pm->lock);
 	retval = pm->priv_queue->acquire_packet_buffer(
 			pm->priv_queue,
 			sizeof(struct pm4_query_status) / sizeof(uint32_t),
 			(unsigned int **)&packet);
 	if (retval != 0)
 		goto fail_acquire_packet_buffer;
 	memset(packet, 0, sizeof(struct pm4_query_status));
 	packet->header.u32all = build_pm4_header(IT_QUERY_STATUS,
 					sizeof(struct pm4_query_status));
 	packet->bitfields2.context_id = 0;
 	packet->bitfields2.interrupt_sel =
 			interrupt_sel__mes_query_status__completion_status;
 	packet->bitfields2.command =
 			command__mes_query_status__fence_only_after_write_ack;
 	packet->addr_hi = upper_32_bits((uint64_t)fence_address);
 	packet->addr_lo = lower_32_bits((uint64_t)fence_address);
 	packet->data_hi = upper_32_bits((uint64_t)fence_value);
 	packet->data_lo = lower_32_bits((uint64_t)fence_value);
 	pm->priv_queue->submit_packet(pm->priv_queue);
 	pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT);
 	mutex_unlock(&pm->lock);
 	return 0;
 fail_acquire_packet_buffer:
 	mutex_unlock(&pm->lock);
 	return retval;
 }
 int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
 			enum kfd_preempt_type_filter mode,
 			uint32_t filter_param, bool reset,
 			unsigned int sdma_engine)
 {
 	int retval;
 	uint32_t *buffer;
 	struct pm4_unmap_queues *packet;
 	BUG_ON(!pm);
 	mutex_lock(&pm->lock);
 	retval = pm->priv_queue->acquire_packet_buffer(
 			pm->priv_queue,
 			sizeof(struct pm4_unmap_queues) / sizeof(uint32_t),
 			&buffer);
 	if (retval != 0)
 		goto err_acquire_packet_buffer;
 	packet = (struct pm4_unmap_queues *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_unmap_queues));
 	packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES,
 					sizeof(struct pm4_unmap_queues));
 	switch (type) {
 	case KFD_QUEUE_TYPE_COMPUTE:
 	case KFD_QUEUE_TYPE_DIQ:
 		packet->bitfields2.engine_sel =
 			engine_sel__mes_unmap_queues__compute;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
 		packet->bitfields2.engine_sel =
 			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
 		break;
 	default:
 		BUG();
 		break;
 	}
 	if (reset)
 		packet->bitfields2.action =
 				action__mes_unmap_queues__reset_queues;
 	else
 		packet->bitfields2.action =
 				action__mes_unmap_queues__preempt_queues;
 	switch (mode) {
 	case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE:
 		packet->bitfields2.queue_sel =
 				queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
 		packet->bitfields2.num_queues = 1;
 		packet->bitfields3b.doorbell_offset0 = filter_param;
 		break;
 	case KFD_PREEMPT_TYPE_FILTER_BY_PASID:
 		packet->bitfields2.queue_sel =
 				queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
 		packet->bitfields3a.pasid = filter_param;
 		break;
 	case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES:
 		packet->bitfields2.queue_sel =
 				queue_sel__mes_unmap_queues__perform_request_on_all_active_queues;
 		break;
 	default:
 		BUG();
 		break;
 	};
 	pm->priv_queue->submit_packet(pm->priv_queue);
 	pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT);
 	mutex_unlock(&pm->lock);
 	return 0;
 err_acquire_packet_buffer:
 	mutex_unlock(&pm->lock);
 	return retval;
 }
 /*
 * pm_release_ib - free the runlist IB, if one is allocated.
 *
 * Takes pm->lock itself, so it must be called without that lock held.
 * Calling it when no IB is allocated does nothing.
 */
 void pm_release_ib(struct packet_manager *pm)
 {
 	BUG_ON(!pm);
 	mutex_lock(&pm->lock);
 	if (!pm->allocated)
 		goto unlock;
 	kfd2kgd->free_mem(pm->dqm->dev->kgd,
 			(struct kgd_mem *) pm->ib_buffer_obj);
 	pm->allocated = false;
 unlock:
 	mutex_unlock(&pm->lock);
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
@ -0,0 +1,97 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include <linux/slab.h>
 #include <linux/types.h>
 #include "kfd_priv.h"
 /* One bit per PASID; a set bit marks the PASID as in use or reserved */
 static unsigned long *pasid_bitmap;
 /* Number of PASIDs searched in the bitmap; allocated PASIDs are below it */
 static unsigned int pasid_limit;
 /* Taken by kfd_pasid_alloc() and kfd_set_pasid_limit() */
 static DEFINE_MUTEX(pasid_mutex);
 /*
 * kfd_pasid_init - allocate the PASID bitmap and reserve PASID 0.
 *
 * The limit is taken from the max_num_of_processes module parameter.
 *
 * The bitmap helpers used in this file (set_bit(), find_first_zero_bit(),
 * find_next_bit()) access the bitmap in whole unsigned longs, so the
 * allocation is rounded up to a whole number of longs rather than bytes.
 * A limit of 0 is rejected because bit 0 is set unconditionally below and
 * there would be no storage to set it in.
 *
 * Return: 0 on success, -EINVAL for a zero limit, -ENOMEM if the bitmap
 * cannot be allocated.
 */
 int kfd_pasid_init(void)
 {
 	pasid_limit = max_num_of_processes;
 	if (pasid_limit == 0)
 		return -EINVAL;
 	pasid_bitmap = kzalloc(DIV_ROUND_UP(pasid_limit,
 					BITS_PER_BYTE * sizeof(long)) *
 				sizeof(long), GFP_KERNEL);
 	if (!pasid_bitmap)
 		return -ENOMEM;
 	set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */
 	return 0;
 }
 /* kfd_pasid_exit - free the PASID bitmap allocated by kfd_pasid_init(). */
 void kfd_pasid_exit(void)
 {
 	kfree(pasid_bitmap);
 }
 /*
 * kfd_set_pasid_limit - try to lower the PASID limit to @new_limit.
 *
 * The limit is only ever lowered: a @new_limit that is not below the current
 * limit is accepted without changing anything. Lowering succeeds only if no
 * PASID in [@new_limit, current limit) is marked in the bitmap.
 *
 * Return: true if the limit is now compatible with @new_limit, false if a
 * PASID at or above @new_limit is in use.
 */
 bool kfd_set_pasid_limit(unsigned int new_limit)
 {
 	bool range_is_free;
 	if (new_limit >= pasid_limit)
 		return true;
 	mutex_lock(&pasid_mutex);
 	/* ensure that no pasids >= new_limit are in-use */
 	range_is_free = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) ==
 								pasid_limit);
 	if (range_is_free)
 		pasid_limit = new_limit;
 	mutex_unlock(&pasid_mutex);
 	return range_is_free;
 }
 /*
 * kfd_get_pasid_limit - return the current PASID limit.
 * The value is read without taking pasid_mutex.
 */
 inline unsigned int kfd_get_pasid_limit(void)
 {
 	return pasid_limit;
 }
 /*
 * kfd_pasid_alloc - allocate the lowest free PASID below the current limit.
 *
 * Return: the allocated PASID, or 0 (the reserved PASID) when none is free.
 */
 unsigned int kfd_pasid_alloc(void)
 {
 	unsigned int pasid;
 	mutex_lock(&pasid_mutex);
 	pasid = find_first_zero_bit(pasid_bitmap, pasid_limit);
 	if (pasid != pasid_limit)
 		set_bit(pasid, pasid_bitmap);
 	else
 		pasid = 0;
 	mutex_unlock(&pasid_mutex);
 	return pasid;
 }
 /*
 * kfd_pasid_free - mark @pasid as free again.
 *
 * @pasid must be non-zero (0 is reserved) and below the current limit;
 * anything else is treated as a bug. The bit is cleared without taking
 * pasid_mutex.
 */
 void kfd_pasid_free(unsigned int pasid)
 {
 	BUG_ON(pasid == 0 || pasid >= pasid_limit);
 	clear_bit(pasid, pasid_bitmap);
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h
@ -0,0 +1,405 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #ifndef KFD_PM4_HEADERS_H_
 #define KFD_PM4_HEADERS_H_
 #ifndef PM4_MES_HEADER_DEFINED
 #define PM4_MES_HEADER_DEFINED
 /*
 * Header dword of a PM4 type-3 packet; it is the first member of every
 * pm4_* packet structure in this file. The anonymous struct names the
 * bit-fields (8 + 8 + 14 + 2 = 32 bits) and u32all aliases the same dword
 * as one raw value.
 */
 union PM4_MES_TYPE_3_HEADER {
 	struct {
 		uint32_t reserved1:8;	/* < reserved */
 		uint32_t opcode:8;	/* < IT opcode */
 		uint32_t count:14;	/* < number of DWORDs - 1
 					 * in the information body.
 					 */
 		uint32_t type:2;	/* < packet identifier.
 					 * It should be 3 for type 3 packets
 					 */
 	};
 	uint32_t u32all;
 };
 #endif /* PM4_MES_HEADER_DEFINED */
 /* --------------------MES_SET_RESOURCES-------------------- */
 #ifndef PM4_MES_SET_RESOURCES_DEFINED
 #define PM4_MES_SET_RESOURCES_DEFINED
 /* Values for the 3-bit queue_type field of struct pm4_set_resources. */
 enum set_resources_queue_type_enum {
 	queue_type__mes_set_resources__kernel_interface_queue_kiq = 0,
 	queue_type__mes_set_resources__hsa_interface_queue_hiq = 1,
 	queue_type__mes_set_resources__hsa_debug_interface_queue = 4
 };
 /*
 * Layout of the SET_RESOURCES packet: eight dwords.
 * Each ordinalN member aliases the raw N-th dword of the packet; the
 * bitfieldsN struct in the same union names the fields inside that dword.
 * Member order and bit widths define the packet layout and must not change.
 */
 struct pm4_set_resources {
 	union {
 		union PM4_MES_TYPE_3_HEADER header;	/* header */
 		uint32_t ordinal1;
 	};
 	union {
 		struct {
 			uint32_t vmid_mask:16;
 			uint32_t unmap_latency:8;
 			uint32_t reserved1:5;
 			enum set_resources_queue_type_enum queue_type:3;
 		} bitfields2;
 		uint32_t ordinal2;
 	};
 	/* 64-bit masks are carried as separate low/high dwords. */
 	uint32_t queue_mask_lo;
 	uint32_t queue_mask_hi;
 	uint32_t gws_mask_lo;
 	uint32_t gws_mask_hi;
 	union {
 		struct {
 			uint32_t oac_mask:16;
 			uint32_t reserved2:16;
 		} bitfields7;
 		uint32_t ordinal7;
 	};
 	union {
 		struct {
 			uint32_t gds_heap_base:6;
 			uint32_t reserved3:5;
 			uint32_t gds_heap_size:6;
 			uint32_t reserved4:15;
 		} bitfields8;
 		uint32_t ordinal8;
 	};
 };
 #endif
 /*--------------------MES_RUN_LIST-------------------- */
 #ifndef PM4_MES_RUN_LIST_DEFINED
 #define PM4_MES_RUN_LIST_DEFINED
 /*
 * Layout of the RUN_LIST packet: four dwords.
 * Each ordinalN member aliases the raw N-th dword; bitfieldsN names the
 * fields inside it. The ib_base address is split into a 30-bit low part
 * (the two lowest bits of dword 2 are reserved) and a 16-bit high part.
 */
 struct pm4_runlist {
 	union {
 		union PM4_MES_TYPE_3_HEADER header;	/* header */
 		uint32_t ordinal1;
 	};
 	union {
 		struct {
 			uint32_t reserved1:2;
 			uint32_t ib_base_lo:30;
 		} bitfields2;
 		uint32_t ordinal2;
 	};
 	union {
 		struct {
 			uint32_t ib_base_hi:16;
 			uint32_t reserved2:16;
 		} bitfields3;
 		uint32_t ordinal3;
 	};
 	union {
 		struct {
 			uint32_t ib_size:20;
 			uint32_t chain:1;
 			uint32_t offload_polling:1;
 			uint32_t reserved3:1;
 			uint32_t valid:1;
 			uint32_t reserved4:8;
 		} bitfields4;
 		uint32_t ordinal4;
 	};
 };
 #endif
 /*--------------------MES_MAP_PROCESS-------------------- */
 #ifndef PM4_MES_MAP_PROCESS_DEFINED
 #define PM4_MES_MAP_PROCESS_DEFINED
 /*
 * Layout of the MAP_PROCESS packet: ten dwords.
 * Each ordinalN member aliases the raw N-th dword; bitfieldsN names the
 * fields inside it. Dwords 4-9 are plain 32-bit values.
 * NOTE(review): several members share names with fields of
 * struct qcm_process_device in kfd_priv.h (sh_mem_*, page_table_base,
 * gds_size, num_gws, num_oac) - presumably filled from there; confirm in the
 * packet manager.
 */
 struct pm4_map_process {
 	union {
 		union PM4_MES_TYPE_3_HEADER header;	/* header */
 		uint32_t ordinal1;
 	};
 	union {
 		struct {
 			uint32_t pasid:16;
 			uint32_t reserved1:8;
 			uint32_t diq_enable:1;
 			uint32_t process_quantum:7;
 		} bitfields2;
 		uint32_t ordinal2;
 	};
 	union {
 		struct {
 			uint32_t page_table_base:28;
 			uint32_t reserved3:4;
 		} bitfields3;
 		uint32_t ordinal3;
 	};
 	uint32_t sh_mem_bases;
 	uint32_t sh_mem_ape1_base;
 	uint32_t sh_mem_ape1_limit;
 	uint32_t sh_mem_config;
 	uint32_t gds_addr_lo;
 	uint32_t gds_addr_hi;
 	union {
 		struct {
 			uint32_t num_gws:6;
 			uint32_t reserved4:2;
 			uint32_t num_oac:4;
 			uint32_t reserved5:4;
 			uint32_t gds_size:6;
 			uint32_t num_queues:10;
 		} bitfields10;
 		uint32_t ordinal10;
 	};
 };
 #endif
 /*--------------------MES_MAP_QUEUES--------------------*/
 #ifndef PM4_MES_MAP_QUEUES_DEFINED
 #define PM4_MES_MAP_QUEUES_DEFINED
 /* Values for the 2-bit queue_sel field of struct pm4_map_queues. */
 enum map_queues_queue_sel_enum {
 	queue_sel__mes_map_queues__map_to_specified_queue_slots = 0,
 	queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1,
 	queue_sel__mes_map_queues__enable_process_queues = 2
 };
 /* Values for the 2-bit vidmem field of struct pm4_map_queues. */
 enum map_queues_vidmem_enum {
 	vidmem__mes_map_queues__uses_no_video_memory = 0,
 	vidmem__mes_map_queues__uses_video_memory = 1
 };
 /* Values for the 2-bit alloc_format field of struct pm4_map_queues. */
 enum map_queues_alloc_format_enum {
 	alloc_format__mes_map_queues__one_per_pipe = 0,
 	alloc_format__mes_map_queues__all_on_one_pipe = 1
 };
 /* Values for the 3-bit engine_sel field of struct pm4_map_queues. */
 enum map_queues_engine_sel_enum {
 	engine_sel__mes_map_queues__compute = 0,
 	engine_sel__mes_map_queues__sdma0 = 2,
 	engine_sel__mes_map_queues__sdma1 = 3
 };
 /*
 * Layout of the MAP_QUEUES packet: two fixed dwords followed by one
 * five-dword group per queue (doorbell/queue dword, MQD address low/high,
 * write-pointer address low/high).
 * The group array is declared with a single element, so
 * sizeof(struct pm4_map_queues) covers exactly one queue.
 */
 struct pm4_map_queues {
 	union {
 		union PM4_MES_TYPE_3_HEADER header;	/* header */
 		uint32_t ordinal1;
 	};
 	union {
 		struct {
 			uint32_t reserved1:4;
 			enum map_queues_queue_sel_enum queue_sel:2;
 			uint32_t reserved2:2;
 			uint32_t vmid:4;
 			uint32_t reserved3:4;
 			enum map_queues_vidmem_enum vidmem:2;
 			uint32_t reserved4:6;
 			enum map_queues_alloc_format_enum alloc_format:2;
 			enum map_queues_engine_sel_enum engine_sel:3;
 			uint32_t num_queues:3;
 		} bitfields2;
 		uint32_t ordinal2;
 	};
 	struct {
 		union {
 			struct {
 				uint32_t reserved5:2;
 				uint32_t doorbell_offset:21;
 				uint32_t reserved6:3;
 				uint32_t queue:6;
 			} bitfields3;
 			uint32_t ordinal3;
 		};
 		uint32_t mqd_addr_lo;
 		uint32_t mqd_addr_hi;
 		uint32_t wptr_addr_lo;
 		uint32_t wptr_addr_hi;
 	} mes_map_queues_ordinals[1];	/* 1..N of these ordinal groups */
 };
 #endif
 /*--------------------MES_QUERY_STATUS--------------------*/
 #ifndef PM4_MES_QUERY_STATUS_DEFINED
 #define PM4_MES_QUERY_STATUS_DEFINED
 /* Values for the 2-bit interrupt_sel field of struct pm4_query_status. */
 enum query_status_interrupt_sel_enum {
 	interrupt_sel__mes_query_status__completion_status = 0,
 	interrupt_sel__mes_query_status__process_status = 1,
 	interrupt_sel__mes_query_status__queue_status = 2
 };
 /* Values for the 2-bit command field of struct pm4_query_status. */
 enum query_status_command_enum {
 	command__mes_query_status__interrupt_only = 0,
 	command__mes_query_status__fence_only_immediate = 1,
 	command__mes_query_status__fence_only_after_write_ack = 2,
 	command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3
 };
 /* Values for the 3-bit engine_sel field of struct pm4_query_status. */
 enum query_status_engine_sel_enum {
 	engine_sel__mes_query_status__compute = 0,
 	engine_sel__mes_query_status__sdma0_queue = 2,
 	engine_sel__mes_query_status__sdma1_queue = 3
 };
 /*
 * Layout of the QUERY_STATUS packet: seven dwords.
 * Dword 3 has two alternative views of the same 32 bits: bitfields3a
 * (pasid) and bitfields3b (doorbell offset and engine select).
 * NOTE(review): which view applies presumably depends on interrupt_sel -
 * confirm against the packet specification.
 * The 64-bit address and data values are carried as low/high dword pairs.
 */
 struct pm4_query_status {
 	union {
 		union PM4_MES_TYPE_3_HEADER header;	/* header */
 		uint32_t ordinal1;
 	};
 	union {
 		struct {
 			uint32_t context_id:28;
 			enum query_status_interrupt_sel_enum interrupt_sel:2;
 			enum query_status_command_enum command:2;
 		} bitfields2;
 		uint32_t ordinal2;
 	};
 	union {
 		struct {
 			uint32_t pasid:16;
 			uint32_t reserved1:16;
 		} bitfields3a;
 		struct {
 			uint32_t reserved2:2;
 			uint32_t doorbell_offset:21;
 			uint32_t reserved3:3;
 			enum query_status_engine_sel_enum engine_sel:3;
 			uint32_t reserved4:3;
 		} bitfields3b;
 		uint32_t ordinal3;
 	};
 	uint32_t addr_lo;
 	uint32_t addr_hi;
 	uint32_t data_lo;
 	uint32_t data_hi;
 };
 #endif
 /*--------------------MES_UNMAP_QUEUES--------------------*/
 #ifndef PM4_MES_UNMAP_QUEUES_DEFINED
 #define PM4_MES_UNMAP_QUEUES_DEFINED
 /* Values for the 2-bit action field of struct pm4_unmap_queues. */
 enum unmap_queues_action_enum {
 	action__mes_unmap_queues__preempt_queues = 0,
 	action__mes_unmap_queues__reset_queues = 1,
 	action__mes_unmap_queues__disable_process_queues = 2
 };
 /* Values for the 2-bit queue_sel field of struct pm4_unmap_queues. */
 enum unmap_queues_queue_sel_enum {
 	queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0,
 	queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1,
 	queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2
 };
 /* Values for the 3-bit engine_sel field of struct pm4_unmap_queues. */
 enum unmap_queues_engine_sel_enum {
 	engine_sel__mes_unmap_queues__compute = 0,
 	engine_sel__mes_unmap_queues__sdma0 = 2,
 	engine_sel__mes_unmap_queues__sdma1 = 3
 };
 /*
 * Layout of the UNMAP_QUEUES packet: six dwords.
 * Dword 3 has two alternative views: bitfields3a (pasid) and bitfields3b
 * (doorbell_offset0). Dwords 4-6 hold doorbell_offset1..3, so the packet
 * has room for up to four doorbell offsets.
 * NOTE(review): the view of dword 3 presumably follows queue_sel - confirm
 * against the packet specification.
 */
 struct pm4_unmap_queues {
 	union {
 		union PM4_MES_TYPE_3_HEADER header;	/* header */
 		uint32_t ordinal1;
 	};
 	union {
 		struct {
 			enum unmap_queues_action_enum action:2;
 			uint32_t reserved1:2;
 			enum unmap_queues_queue_sel_enum queue_sel:2;
 			uint32_t reserved2:20;
 			enum unmap_queues_engine_sel_enum engine_sel:3;
 			uint32_t num_queues:3;
 		} bitfields2;
 		uint32_t ordinal2;
 	};
 	union {
 		struct {
 			uint32_t pasid:16;
 			uint32_t reserved3:16;
 		} bitfields3a;
 		struct {
 			uint32_t reserved4:2;
 			uint32_t doorbell_offset0:21;
 			uint32_t reserved5:9;
 		} bitfields3b;
 		uint32_t ordinal3;
 	};
 	union {
 		struct {
 			uint32_t reserved6:2;
 			uint32_t doorbell_offset1:21;
 			uint32_t reserved7:9;
 		} bitfields4;
 		uint32_t ordinal4;
 	};
 	union {
 		struct {
 			uint32_t reserved8:2;
 			uint32_t doorbell_offset2:21;
 			uint32_t reserved9:9;
 		} bitfields5;
 		uint32_t ordinal5;
 	};
 	union {
 		struct {
 			uint32_t reserved10:2;
 			uint32_t doorbell_offset3:21;
 			uint32_t reserved11:9;
 		} bitfields6;
 		uint32_t ordinal6;
 	};
 };
 #endif
 /*
 * Anonymous enum used as a named integer constant.
 * NOTE(review): presumably an event-type code placed in a PM4 packet;
 * confirm at the use site.
 */
 enum {
 	CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
 };
 #endif /* KFD_PM4_HEADERS_H_ */
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h
@ -0,0 +1,107 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #ifndef KFD_PM4_OPCODES_H
 #define KFD_PM4_OPCODES_H
 /*
 * PM4 IT opcodes. Every value fits in 8 bits.
 * NOTE(review): presumably these are the values written to the 8-bit opcode
 * field of union PM4_MES_TYPE_3_HEADER (kfd_pm4_headers.h) - confirm at the
 * use sites. The values are fixed and must not be renumbered.
 */
 enum it_opcode_type {
 	IT_NOP                               = 0x10,
 	IT_SET_BASE                          = 0x11,
 	IT_CLEAR_STATE                       = 0x12,
 	IT_INDEX_BUFFER_SIZE                 = 0x13,
 	IT_DISPATCH_DIRECT                   = 0x15,
 	IT_DISPATCH_INDIRECT                 = 0x16,
 	IT_ATOMIC_GDS                        = 0x1D,
 	IT_OCCLUSION_QUERY                   = 0x1F,
 	IT_SET_PREDICATION                   = 0x20,
 	IT_REG_RMW                           = 0x21,
 	IT_COND_EXEC                         = 0x22,
 	IT_PRED_EXEC                         = 0x23,
 	IT_DRAW_INDIRECT                     = 0x24,
 	IT_DRAW_INDEX_INDIRECT               = 0x25,
 	IT_INDEX_BASE                        = 0x26,
 	IT_DRAW_INDEX_2                      = 0x27,
 	IT_CONTEXT_CONTROL                   = 0x28,
 	IT_INDEX_TYPE                        = 0x2A,
 	IT_DRAW_INDIRECT_MULTI               = 0x2C,
 	IT_DRAW_INDEX_AUTO                   = 0x2D,
 	IT_NUM_INSTANCES                     = 0x2F,
 	IT_DRAW_INDEX_MULTI_AUTO             = 0x30,
 	IT_INDIRECT_BUFFER_CNST              = 0x33,
 	IT_STRMOUT_BUFFER_UPDATE             = 0x34,
 	IT_DRAW_INDEX_OFFSET_2               = 0x35,
 	IT_DRAW_PREAMBLE                     = 0x36,
 	IT_WRITE_DATA                        = 0x37,
 	IT_DRAW_INDEX_INDIRECT_MULTI         = 0x38,
 	IT_MEM_SEMAPHORE                     = 0x39,
 	IT_COPY_DW                           = 0x3B,
 	IT_WAIT_REG_MEM                      = 0x3C,
 	IT_INDIRECT_BUFFER                   = 0x3F,
 	IT_COPY_DATA                         = 0x40,
 	IT_PFP_SYNC_ME                       = 0x42,
 	IT_SURFACE_SYNC                      = 0x43,
 	IT_COND_WRITE                        = 0x45,
 	IT_EVENT_WRITE                       = 0x46,
 	IT_EVENT_WRITE_EOP                   = 0x47,
 	IT_EVENT_WRITE_EOS                   = 0x48,
 	IT_RELEASE_MEM                       = 0x49,
 	IT_PREAMBLE_CNTL                     = 0x4A,
 	IT_DMA_DATA                          = 0x50,
 	IT_ACQUIRE_MEM                       = 0x58,
 	IT_REWIND                            = 0x59,
 	IT_LOAD_UCONFIG_REG                  = 0x5E,
 	IT_LOAD_SH_REG                       = 0x5F,
 	IT_LOAD_CONFIG_REG                   = 0x60,
 	IT_LOAD_CONTEXT_REG                  = 0x61,
 	IT_SET_CONFIG_REG                    = 0x68,
 	IT_SET_CONTEXT_REG                   = 0x69,
 	IT_SET_CONTEXT_REG_INDIRECT          = 0x73,
 	IT_SET_SH_REG                        = 0x76,
 	IT_SET_SH_REG_OFFSET                 = 0x77,
 	IT_SET_QUEUE_REG                     = 0x78,
 	IT_SET_UCONFIG_REG                   = 0x79,
 	IT_SCRATCH_RAM_WRITE                 = 0x7D,
 	IT_SCRATCH_RAM_READ                  = 0x7E,
 	IT_LOAD_CONST_RAM                    = 0x80,
 	IT_WRITE_CONST_RAM                   = 0x81,
 	IT_DUMP_CONST_RAM                    = 0x83,
 	IT_INCREMENT_CE_COUNTER              = 0x84,
 	IT_INCREMENT_DE_COUNTER              = 0x85,
 	IT_WAIT_ON_CE_COUNTER                = 0x86,
 	IT_WAIT_ON_DE_COUNTER_DIFF           = 0x88,
 	IT_SWITCH_BUFFER                     = 0x8B,
 	/* 0xA0-0xA5 share their names with the pm4_* packets in kfd_pm4_headers.h */
 	IT_SET_RESOURCES                     = 0xA0,
 	IT_MAP_PROCESS                       = 0xA1,
 	IT_MAP_QUEUES                        = 0xA2,
 	IT_UNMAP_QUEUES                      = 0xA3,
 	IT_QUERY_STATUS                      = 0xA4,
 	IT_RUN_LIST                          = 0xA5,
 };
 /*
 * PM4 packet type identifiers.
 * NOTE(review): presumably the values for the 2-bit type field of the packet
 * header (3 for type 3 packets, per kfd_pm4_headers.h) - confirm at the use
 * sites.
 */
 #define PM4_TYPE_0 0
 #define PM4_TYPE_2 2
 #define PM4_TYPE_3 3
 #endif /* KFD_PM4_OPCODES_H */
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@ -0,0 +1,598 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef KFD_PRIV_H_INCLUDED
 #define KFD_PRIV_H_INCLUDED
 #include <linux/hashtable.h>
 #include <linux/mmu_notifier.h>
 #include <linux/mutex.h>
 #include <linux/types.h>
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
 #include <linux/kfd_ioctl.h>
 #include <kgd_kfd_interface.h>
 /*
 * Octal permission bits: read-only for owner, group and others.
 * NOTE(review): presumably applied to the sysfs files KFD creates - confirm
 * at the use site.
 */
 #define KFD_SYSFS_FILE_MODE 0444
 /*
 * When working with the cp scheduler we should assign the HIQ manually or
 * via the radeon driver to a fixed hqd slot. These are the fixed HIQ hqd
 * slot definitions for Kaveri. In Kaveri only the queues of the first ME
 * participate in cp scheduling; with that in mind, the HIQ slot is placed
 * in the second ME.
 */
 #define KFD_CIK_HIQ_PIPE 4
 #define KFD_CIK_HIQ_QUEUE 0
 /* GPU ID hash width in bits */
 #define KFD_GPU_ID_HASH_WIDTH 16
 /*
 * Macro for allocating structures.
 * Given a pointer expression, allocates one zero-initialised object of the
 * pointed-to type with GFP_KERNEL and evaluates to a pointer of the same
 * type (NULL if the allocation fails). The argument is only used inside
 * typeof() and sizeof(), so it is never evaluated; it is parenthesised so
 * that expressions such as "p + 1" are sized correctly.
 */
 #define kfd_alloc_struct(ptr_to_struct)	\
 	((typeof(ptr_to_struct)) kzalloc(sizeof(*(ptr_to_struct)), GFP_KERNEL))
 /*
 * Kernel module parameter to specify maximum number of supported processes.
 * kfd_pasid_init() uses this value as the initial PASID limit.
 * NOTE(review): the _DEFAULT value and the upper bound below are presumably
 * applied where the parameter is defined/validated - not visible here.
 */
 extern int max_num_of_processes;
 #define KFD_MAX_NUM_OF_PROCESSES_DEFAULT 32
 #define KFD_MAX_NUM_OF_PROCESSES 512
 /*
 * Kernel module parameter to specify maximum number of supported queues
 * per process. KFD_MAX_NUM_OF_QUEUES_PER_PROCESS also sizes
 * kfd_process.allocated_queue_bitmap.
 */
 extern int max_num_of_queues_per_process;
 #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS_DEFAULT 128
 #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
 /* NOTE(review): unit (bytes vs. dwords) is not visible here - confirm. */
 #define KFD_KERNEL_QUEUE_SIZE 2048
 /* Kernel module parameter to specify the scheduling policy */
 extern int sched_policy;
 /**
 * enum kfd_sched_policy
 *
 * @KFD_SCHED_POLICY_HWS: H/W scheduling policy, known as command processor
 * (cp) scheduling. In this scheduling mode the firmware code is used to
 * schedule the user mode queues and kernel queues such as HIQ and DIQ.
 * The HIQ queue is a special queue that dispatches the configuration and
 * the list of currently running user mode queues to the cp.
 * The DIQ queue is a debugging queue that dispatches debugging commands to
 * the firmware.
 * In this scheduling mode the user mode queue over-subscription feature is
 * enabled.
 *
 * @KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: The same as above but with the
 * over-subscription feature disabled.
 *
 * @KFD_SCHED_POLICY_NO_HWS: No H/W scheduling. This mode sets the command
 * processor registers directly and sets up the queues "manually". This
 * mode is used *ONLY* for debugging purposes.
 *
 */
 enum kfd_sched_policy {
 	KFD_SCHED_POLICY_HWS = 0,
 	KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION,
 	KFD_SCHED_POLICY_NO_HWS
 };
 /*
 * Cache coherency policy selector.
 * NOTE(review): presumably chosen through the Set Memory Policy IOCTL -
 * confirm at the use site.
 */
 enum cache_policy {
 	cache_policy_coherent,
 	cache_policy_noncoherent
 };
 /*
 * Constant per-device-type parameters; struct kfd_dev refers to one through
 * a const pointer.
 */
 struct kfd_device_info {
 	/* Number of PASID bits supported (NOTE(review): by the device - confirm) */
 	unsigned int max_pasid_bits;
 	/* Size of one interrupt handler ring entry */
 	size_t ih_ring_entry_size;
 	/* MQD size after alignment (NOTE(review): unit presumably bytes) */
 	uint16_t mqd_size_aligned;
 };
 /*
 * State KFD keeps for one device: the kgd and PCI device handles, the
 * doorbell layout, the software interrupt ring and the device queue
 * manager.
 */
 struct kfd_dev {
 	struct kgd_dev *kgd;
 	const struct kfd_device_info *device_info;
 	struct pci_dev *pdev;
 	unsigned int id;		/* topology stub index */
 	phys_addr_t doorbell_base;	/* Start of actual doorbells used by
 					 * KFD. It is aligned for mapping
 					 * into user mode
 					 */
 	size_t doorbell_id_offset;	/* Doorbell offset (from KFD doorbell
 					 * to HW doorbell, GFX reserved some
 					 * at the start)
 					 */
 	size_t doorbell_process_limit;	/* Number of processes we have doorbell
 					 * space for.
 					 */
 	u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells
 					   * page used by kernel queue
 					   */
 	struct kgd2kfd_shared_resources shared_resources;
 	/* Software interrupt ring with its read/write positions and worker */
 	void *interrupt_ring;
 	size_t interrupt_ring_size;
 	atomic_t interrupt_ring_rptr;
 	atomic_t interrupt_ring_wptr;
 	struct work_struct interrupt_work;
 	spinlock_t interrupt_lock;
 	/* QCM Device instance */
 	struct device_queue_manager *dqm;
 	bool init_complete;
 	/*
 	 * Interrupts of interest to KFD are copied
 	 * from the HW ring into a SW ring.
 	 */
 	bool interrupts_active;
 };
 /*
 * KGD2KFD callbacks.
 * Declarations only; the definitions live outside this header.
 * kgd2kfd_probe() returns the struct kfd_dev that the other calls take;
 * kgd2kfd_device_init() reports its result as a bool.
 */
 void kgd2kfd_exit(void);
 struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev);
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			 const struct kgd2kfd_shared_resources *gpu_resources);
 void kgd2kfd_device_exit(struct kfd_dev *kfd);
 /* Table of calls in the opposite direction (KFD into KGD). */
 extern const struct kfd2kgd_calls *kfd2kgd;
 /*
 * A memory object described by an opaque buffer-object handle, its GPU
 * address and a CPU pointer to the same memory.
 */
 struct kfd_mem_obj {
 	void *bo;		/* opaque handle; the type is not known to KFD */
 	uint64_t gpu_addr;
 	uint32_t *cpu_ptr;
 };
 /* Memory pool selector. Values start at 1; 0 is not assigned. */
 enum kfd_mempool {
 	KFD_MEMPOOL_SYSTEM_CACHEABLE = 1,
 	KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2,
 	KFD_MEMPOOL_FRAMEBUFFER = 3,
 };
 /*
 * Character device interface.
 * Declarations only. kfd_chardev_init() returns an int
 * (NOTE(review): presumably 0 or a negative errno - confirm).
 */
 int kfd_chardev_init(void);
 void kfd_chardev_exit(void);
 struct device *kfd_chardev(void);
 /**
 * enum kfd_preempt_type_filter
 *
 * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts a single queue.
 *
 * @KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the
 *						running queues list.
 *
 * @KFD_PREEMPT_TYPE_FILTER_BY_PASID: Preempts the queues that belong to
 *						a specific process.
 *
 */
 enum kfd_preempt_type_filter {
 	KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE,
 	KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES,
 	KFD_PREEMPT_TYPE_FILTER_BY_PASID
 };
 /*
 * Kind of preemption to request.
 * NOTE(review): presumably _WAVEFRONT preempts and _WAVEFRONT_RESET also
 * resets the queue - confirm at the use site.
 */
 enum kfd_preempt_type {
 	KFD_PREEMPT_TYPE_WAVEFRONT,
 	KFD_PREEMPT_TYPE_WAVEFRONT_RESET
 };
 /**
 * enum kfd_queue_type
 *
 * @KFD_QUEUE_TYPE_COMPUTE: Regular user mode queue type.
 *
 * @KFD_QUEUE_TYPE_SDMA: SDMA user mode queue type.
 *
 * @KFD_QUEUE_TYPE_HIQ: HIQ queue type.
 *
 * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
 */
 enum kfd_queue_type  {
 	KFD_QUEUE_TYPE_COMPUTE,
 	KFD_QUEUE_TYPE_SDMA,
 	KFD_QUEUE_TYPE_HIQ,
 	KFD_QUEUE_TYPE_DIQ
 };
 /* Packet format of a queue; stored in queue_properties.format. */
 enum kfd_queue_format {
 	KFD_QUEUE_FORMAT_PM4,
 	KFD_QUEUE_FORMAT_AQL
 };
 /**
 * struct queue_properties
 *
 * @type: The queue type.
 *
 * @format: The queue packet format (see enum kfd_queue_format).
 *
 * @queue_id: Queue identifier.
 *
 * @queue_address: Queue ring buffer address.
 *
 * @queue_size: Queue ring buffer size.
 *
 * @priority: Defines the queue priority relative to other queues in the
 * process.
 * This is just an indication and HW scheduling may override the priority as
 * necessary while keeping the relative prioritization.
 * The priority ranges from 0 to f, where f is the highest priority.
 * Currently all queues are initialized with the highest priority.
 *
 * @queue_percent: This field is partially implemented; currently a zero in
 * this field means that the queue is not active.
 *
 * @read_ptr: User space address which points to the number of dwords the
 * cp has read from the ring buffer. This field is updated automatically by
 * the H/W.
 *
 * @write_ptr: Defines the number of dwords written to the ring buffer.
 *
 * @doorbell_ptr: The purpose of this field is to notify the H/W of a new
 * packet written to the queue ring buffer. This field should be similar to
 * write_ptr and the user should update it after updating write_ptr.
 *
 * @doorbell_off: The doorbell offset in the doorbell pci-bar.
 *
 * @is_interop: Defines if this is an interop queue. An interop queue is a
 * queue that can access both graphics and compute resources.
 *
 * @is_active: Defines if the queue is active or not.
 *
 * @vmid: If the scheduling mode is no cp scheduling, this field defines the
 * vmid of the queue.
 *
 * This structure represents the queue properties for each queue, no matter
 * if it's a user mode or a kernel mode queue.
 *
 */
 struct queue_properties {
 	enum kfd_queue_type type;
 	enum kfd_queue_format format;
 	unsigned int queue_id;
 	uint64_t queue_address;
 	uint64_t  queue_size;
 	uint32_t priority;
 	uint32_t queue_percent;
 	uint32_t *read_ptr;
 	uint32_t *write_ptr;
 	uint32_t *doorbell_ptr;
 	uint32_t doorbell_off;
 	bool is_interop;
 	bool is_active;
 	/* Not relevant for user mode queues in cp scheduling */
 	unsigned int vmid;
 };
 /**
 * struct queue
 *
 * @list: Queue linked list.
 *
 * @mqd: The queue MQD.
 *
 * @mqd_mem_obj: The MQD local gpu memory object.
 *
 * @gart_mqd_addr: The MQD gart mc address.
 *
 * @properties: The queue properties.
 *
 * @mec: Used only in no cp scheduling mode; identifies the micro engine id
 * that the queue should be executed on.
 *
 * @pipe: Used only in no cp scheduling mode; identifies the queue's pipe id.
 *
 * @queue: Used only in no cp scheduling mode; identifies the queue's slot.
 *
 * @process: The kfd process that created this queue.
 *
 * @device: The kfd device that created this queue.
 *
 * This structure represents user mode compute queues.
 * It contains all the necessary data to handle such queues.
 *
 */
 struct queue {
 	struct list_head list;
 	void *mqd;
 	struct kfd_mem_obj *mqd_mem_obj;
 	uint64_t gart_mqd_addr;
 	struct queue_properties properties;
 	uint32_t mec;
 	uint32_t pipe;
 	uint32_t queue;
 	struct kfd_process	*process;
 	struct kfd_dev		*device;
 };
 /*
 * MQD manager type, passed to mqd_manager_init().
 * KFD_MQD_TYPE_MAX is the number of real types, not a type itself.
 * Please read the kfd_mqd_manager.h description.
 */
 enum KFD_MQD_TYPE {
 	KFD_MQD_TYPE_CIK_COMPUTE = 0, /* for no cp scheduling */
 	KFD_MQD_TYPE_CIK_HIQ, /* for hiq */
 	KFD_MQD_TYPE_CIK_CP, /* for cp queues and diq */
 	KFD_MQD_TYPE_CIK_SDMA, /* for sdma queues */
 	KFD_MQD_TYPE_MAX
 };
 /*
 * Resource description passed to pm_send_set_resources().
 * NOTE(review): the field names mirror struct pm4_set_resources
 * (kfd_pm4_headers.h); presumably the 64-bit masks are split into the
 * packet's _lo/_hi dwords - confirm in the packet manager.
 */
 struct scheduling_resources {
 	unsigned int vmid_mask;
 	enum kfd_queue_type type;
 	uint64_t queue_mask;
 	uint64_t gws_mask;
 	uint32_t oac_mask;
 	uint32_t gds_heap_base;
 	uint32_t gds_heap_size;
 };
 /*
 * Per-process queue manager, embedded in struct kfd_process as "pqm" and
 * operated on by the pqm_*() functions.
 */
 struct process_queue_manager {
 	/* data */
 	struct kfd_process	*process;	/* owning process */
 	unsigned int		num_concurrent_processes;
 	/* NOTE(review): presumably a list of struct process_queue_node */
 	struct list_head	queues;
 	/* NOTE(review): presumably one bit per queue id in use - confirm */
 	unsigned long		*queue_slot_bitmap;
 };
 /*
 * Queue-context-management data for one process on one device; embedded in
 * struct kfd_process_device as "qpd".
 */
 struct qcm_process_device {
 	/* The Device Queue Manager that owns this data */
 	struct device_queue_manager *dqm;
 	struct process_queue_manager *pqm;
 	/* Device Queue Manager lock */
 	struct mutex *lock;
 	/* Queues list */
 	struct list_head queues_list;
 	struct list_head priv_queue_list;
 	unsigned int queue_count;
 	unsigned int vmid;
 	bool is_debug;
 	/*
 	 * All the memory management data should be here too
 	 */
 	uint64_t gds_context_area;
 	uint32_t sh_mem_config;
 	uint32_t sh_mem_bases;
 	uint32_t sh_mem_ape1_base;
 	uint32_t sh_mem_ape1_limit;
 	uint32_t page_table_base;
 	uint32_t gds_size;
 	uint32_t num_gws;
 	uint32_t num_oac;
 };
 /* Data that is per-process-per device. */
 struct kfd_process_device {
 	/*
 	 * List of all per-device data for a process.
 	 * Starts from kfd_process.per_device_data.
 	 */
 	struct list_head per_device_list;
 	/* The device that owns this data. */
 	struct kfd_dev *dev;
 	/* per-process-per device QCM data structure */
 	struct qcm_process_device qpd;
 	/* Apertures: base/limit pairs for LDS, GPUVM and scratch */
 	uint64_t lds_base;
 	uint64_t lds_limit;
 	uint64_t gpuvm_base;
 	uint64_t gpuvm_limit;
 	uint64_t scratch_base;
 	uint64_t scratch_limit;
 	/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
 	bool bound;
 };
 /* Process data */
 struct kfd_process {
 	/*
 	 * kfd_process are stored in an mm_struct*->kfd_process*
 	 * hash table (kfd_processes_table in kfd_process.c)
 	 */
 	struct hlist_node kfd_processes;
 	struct mm_struct *mm;
 	struct mutex mutex;
 	/*
 	 * In any process, the thread that started main() is the lead
 	 * thread and outlives the rest.
 	 * It is here because amd_iommu_bind_pasid wants a task_struct.
 	 */
 	struct task_struct *lead_thread;
 	/* We want to receive a notification when the mm_struct is destroyed */
 	struct mmu_notifier mmu_notifier;
 	/* Use for delayed freeing of kfd_process structure */
 	struct rcu_head	rcu;
 	unsigned int pasid;
 	/*
 	 * List of kfd_process_device structures,
 	 * one for each device the process is using.
 	 */
 	struct list_head per_device_data;
 	struct process_queue_manager pqm;
 	/* The process's queues. */
 	size_t queue_array_size;
 	/*
 	 * Size is queue_array_size, up to MAX_PROCESS_QUEUES.
 	 * NOTE(review): no MAX_PROCESS_QUEUES macro is visible in this header;
 	 * presumably KFD_MAX_NUM_OF_QUEUES_PER_PROCESS is meant - confirm.
 	 */
 	struct kfd_queue **queues;
 	/* One bit per possible queue, KFD_MAX_NUM_OF_QUEUES_PER_PROCESS bits */
 	unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)];
 	/* Is the user space process 32 bit? */
 	bool is_32bit_user_mode;
 };
 /*
 * Process management. Declarations only; the definitions are outside this
 * header (NOTE(review): presumably kfd_process.c).
 */
 void kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
 /*
 * Both take the task as a const pointer and return a struct kfd_process
 * pointer. NOTE(review): the failure convention (NULL vs. ERR_PTR) is not
 * visible here - check the definitions before testing the result.
 */
 struct kfd_process *kfd_create_process(const struct task_struct *);
 struct kfd_process *kfd_get_process(const struct task_struct *);
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 							struct kfd_process *p);
 void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
 /*
 * NOTE(review): create_pdd presumably selects whether missing per-device
 * data is created - confirm in the definition.
 */
 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p,
 							int create_pdd);
 /* Process device data iterator */
 struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p);
 struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p,
 						struct kfd_process_device *pdd);
 bool kfd_has_process_device_data(struct kfd_process *p);
 /* PASIDs (implemented in kfd_pasid.c) */
 /* Allocates the PASID bitmap; returns 0 or -ENOMEM. PASID 0 is reserved. */
 int kfd_pasid_init(void);
 /* Frees the PASID bitmap. */
 void kfd_pasid_exit(void);
 /*
 * Lowers the PASID limit to new_limit. Returns false if a PASID at or above
 * new_limit is already in use; returns true otherwise, including when
 * new_limit is not below the current limit (nothing changes in that case).
 */
 bool kfd_set_pasid_limit(unsigned int new_limit);
 unsigned int kfd_get_pasid_limit(void);
 /* Returns a free PASID and marks it used, or 0 if none is available. */
 unsigned int kfd_pasid_alloc(void);
 /* Releases a PASID; BUGs if pasid is 0 or not below the current limit. */
 void kfd_pasid_free(unsigned int pasid);
 /*
 * Doorbells.
 * Declarations only; the definitions are outside this header.
 */
 void kfd_doorbell_init(struct kfd_dev *kfd);
 int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma);
 /*
 * Returns an __iomem doorbell pointer and writes a value through
 * doorbell_off. NOTE(review): presumably the doorbell's offset, and NULL
 * on failure - confirm in the definition.
 */
 u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
 					unsigned int *doorbell_off);
 void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr);
 /* Accessors for a doorbell obtained as an __iomem pointer. */
 u32 read_kernel_doorbell(u32 __iomem *db);
 void write_kernel_doorbell(u32 __iomem *db, u32 value);
 unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
 					struct kfd_process *process,
 					unsigned int queue_id);
 extern struct device *kfd_device;
 /*
 * Topology.
 * Declarations only; the definitions are outside this header.
 * The three lookup functions return a struct kfd_dev pointer
 * (NOTE(review): presumably NULL when nothing matches - confirm).
 */
 int kfd_topology_init(void);
 void kfd_topology_shutdown(void);
 int kfd_topology_add_device(struct kfd_dev *gpu);
 int kfd_topology_remove_device(struct kfd_dev *gpu);
 struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
 /* idx is a uint8_t, so at most 256 devices can be enumerated this way. */
 struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx);
 /*
 * Interrupts.
 * Declarations only. The ring entry is passed as const void *, so these
 * functions do not modify it through that pointer.
 * NOTE(review): its size is presumably kfd_device_info.ih_ring_entry_size.
 */
 int kfd_interrupt_init(struct kfd_dev *dev);
 void kfd_interrupt_exit(struct kfd_dev *dev);
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
 bool enqueue_ih_ring_entry(struct kfd_dev *kfd,	const void *ih_ring_entry);
 /* Power Management */
 void kgd2kfd_suspend(struct kfd_dev *kfd);
 int kgd2kfd_resume(struct kfd_dev *kfd);
 /* amdkfd Apertures */
 int kfd_init_apertures(struct kfd_process *process);
 /* Queue Context Management */
 /*
 * Helpers taking a 64-bit value and returning a 32-bit one; by their names,
 * the low and the high half.
 * Declared without "inline": these are prototypes for functions defined in
 * another translation unit. An inline declaration with no definition in the
 * including file draws "declared but never defined" warnings, and under C99
 * inline rules does not guarantee that an external definition exists.
 */
 uint32_t lower_32(uint64_t x);
 uint32_t upper_32(uint64_t x);
 /*
 * Queue objects. Declarations only.
 * init_queue() takes the properties by value and hands the queue back
 * through *q (NOTE(review): presumably allocated by init_queue and released
 * by uninit_queue - confirm in the definitions).
 */
 int init_queue(struct queue **q, struct queue_properties properties);
 void uninit_queue(struct queue *q);
 void print_queue_properties(struct queue_properties *q);
 void print_queue(struct queue *q);
 /*
 * Constructors/destructors for the manager objects.
 * NOTE(review): the *_init() functions presumably return NULL on failure -
 * confirm in the definitions.
 */
 struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
 					struct kfd_dev *dev);
 struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
 void device_queue_manager_uninit(struct device_queue_manager *dqm);
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 					enum kfd_queue_type type);
 void kernel_queue_uninit(struct kernel_queue *kq);
 /* Process Queue Manager */
 /*
 * List node that can refer to a user mode queue (q) and/or a kernel queue
 * (kq). NOTE(review): presumably exactly one of the two is set, and the
 * node is linked into process_queue_manager.queues - confirm.
 */
 struct process_queue_node {
 	struct queue *q;
 	struct kernel_queue *kq;
 	struct list_head process_queue_list;
 };
 /*
 * Process queue manager operations. Declarations only.
 * pqm_create_queue() reports the new queue's id through *qid; that id is
 * what pqm_destroy_queue() and pqm_update_queue() take as qid.
 * NOTE(review): the int results are presumably 0 or a negative errno.
 */
 int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p);
 void pqm_uninit(struct process_queue_manager *pqm);
 int pqm_create_queue(struct process_queue_manager *pqm,
 			    struct kfd_dev *dev,
 			    struct file *f,
 			    struct queue_properties *properties,
 			    unsigned int flags,
 			    enum kfd_queue_type type,
 			    unsigned int *qid);
 int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid);
 int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
 			struct queue_properties *p);
 /* Packet Manager */
 /*
 * Packet manager constants.
 * NOTE(review): units are not visible here. KFD_HIQ_TIMEOUT is presumably
 * in milliseconds, KFD_FENCE_INIT/KFD_FENCE_COMPLETED are presumably the
 * fence values before and after completion, and KFD_UNMAP_LATENCY is
 * presumably the unmap_latency field of the SET_RESOURCES packet - confirm
 * at the use sites.
 */
 #define KFD_HIQ_TIMEOUT (500)
 #define KFD_FENCE_COMPLETED (100)
 #define KFD_FENCE_INIT   (10)
 #define KFD_UNMAP_LATENCY (150)
 /* State of a packet manager, operated on by the pm_*() functions. */
 struct packet_manager {
 	struct device_queue_manager *dqm;
 	/* Kernel queue used by this packet manager */
 	struct kernel_queue *priv_queue;
 	struct mutex lock;
 	/* NOTE(review): presumably whether ib_buffer_obj is allocated */
 	bool allocated;
 	struct kfd_mem_obj *ib_buffer_obj;
 };
 /*
 * Packet manager operations. Declarations only.
 * NOTE(review): the pm_send_*() names match the SET_RESOURCES, RUN_LIST,
 * QUERY_STATUS and UNMAP_QUEUES packets of kfd_pm4_headers.h; the int
 * results are presumably 0 or a negative errno - confirm.
 */
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
 void pm_uninit(struct packet_manager *pm);
 int pm_send_set_resources(struct packet_manager *pm,
 				struct scheduling_resources *res);
 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 				uint32_t fence_value);
 int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
 			enum kfd_preempt_type_filter mode,
 			uint32_t filter_param, bool reset,
 			unsigned int sdma_engine);
 void pm_release_ib(struct packet_manager *pm);
 uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
 phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
 					struct kfd_process *process);
 #endif
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@ -0,0 +1,415 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include <linux/mutex.h>
 #include <linux/log2.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/amd-iommu.h>
 #include <linux/notifier.h>
 struct mm_struct;
 #include "kfd_priv.h"
 /*
 * Initial size for the array of queues.
 * The allocated size is doubled each time
 * it is exceeded up to MAX_PROCESS_QUEUES.
 */
 #define INITIAL_QUEUE_ARRAY_SIZE 16
 /*
 * Hash table of struct kfd_process (linked through the kfd_processes field).
 * Keyed by the mm_struct pointer of the owning process.
 */
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
 static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
 /* Taken around insertions into and removals from kfd_processes_table. */
 static DEFINE_MUTEX(kfd_processes_mutex);
 /* SRCU domain under which kfd_processes_table is read. */
 DEFINE_STATIC_SRCU(kfd_processes_srcu);
 /* Workqueue on which the final release of a kfd_process is executed. */
 static struct workqueue_struct *kfd_process_wq;
 /*
 * Work item carrying the kfd_process to release.
 * kfd_work is the first member; keep it there as long as any code converts
 * between a work_struct pointer and this structure by plain cast.
 */
 struct kfd_process_release_work {
 	struct work_struct kfd_work;
 	struct kfd_process *p;
 };
 static struct kfd_process *find_process(const struct task_struct *thread);
 static struct kfd_process *create_process(const struct task_struct *thread);
 /* Create the process-release workqueue unless it already exists. */
 void kfd_process_create_wq(void)
 {
 	if (kfd_process_wq)
 		return;
 	kfd_process_wq = create_workqueue("kfd_process_wq");
 }
 /* Flush and destroy the process-release workqueue, if one was created. */
 void kfd_process_destroy_wq(void)
 {
 	if (!kfd_process_wq)
 		return;
 	flush_workqueue(kfd_process_wq);
 	destroy_workqueue(kfd_process_wq);
 	/* Allow kfd_process_create_wq() to build a fresh one later. */
 	kfd_process_wq = NULL;
 }
 struct kfd_process *kfd_create_process(const struct task_struct *thread)
 {
 	struct kfd_process *process;
 	BUG_ON(!kfd_process_wq);
 	if (thread->mm == NULL)
 		return ERR_PTR(-EINVAL);
 	/* Only the pthreads threading model is supported. */
 	if (thread->group_leader->mm != thread->mm)
 		return ERR_PTR(-EINVAL);
 	/* Take mmap_sem because we call __mmu_notifier_register inside */
 	down_write(&thread->mm->mmap_sem);
 	/*
 	 * take kfd processes mutex before starting of process creation
 	 * so there won't be a case where two threads of the same process
 	 * create two kfd_process structures
 	 */
 	mutex_lock(&kfd_processes_mutex);
 	/* A prior open of /dev/kfd could have already created the process. */
 	process = find_process(thread);
 	if (process)
 		pr_debug("kfd: process already found\n");
 	if (!process)
 		process = create_process(thread);
 	mutex_unlock(&kfd_processes_mutex);
 	up_write(&thread->mm->mmap_sem);
 	return process;
 }
 /*
 * Look up the existing kfd_process for @thread.
 * Returns ERR_PTR(-EINVAL) for threads without an mm or whose mm differs
 * from their group leader's; otherwise whatever find_process() returns
 * (NULL when no process is registered for this mm).
 */
 struct kfd_process *kfd_get_process(const struct task_struct *thread)
 {
 	/* Only the pthreads threading model is supported. */
 	if (thread->mm == NULL || thread->group_leader->mm != thread->mm)
 		return ERR_PTR(-EINVAL);
 	return find_process(thread);
 }
 static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
 {
 	struct kfd_process *process;
 	hash_for_each_possible_rcu(kfd_processes_table, process,
 					kfd_processes, (uintptr_t)mm)
 		if (process->mm == mm)
 			return process;
 	return NULL;
 }
 /*
 * Look up the kfd_process for @thread's mm inside an SRCU read-side
 * section on kfd_processes_srcu.  Returns NULL when nothing is registered.
 */
 static struct kfd_process *find_process(const struct task_struct *thread)
 {
 	struct kfd_process *found;
 	int srcu_idx = srcu_read_lock(&kfd_processes_srcu);
 	found = find_process_by_mm(thread->mm);
 	srcu_read_unlock(&kfd_processes_srcu, srcu_idx);
 	return found;
 }
 static void kfd_process_wq_release(struct work_struct *work)
 {
 	struct kfd_process_release_work *my_work;
 	struct kfd_process_device *pdd, *temp;
 	struct kfd_process *p;
 	my_work = (struct kfd_process_release_work *) work;
 	p = my_work->p;
 	mutex_lock(&p->mutex);
 	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
 							per_device_list) {
 		amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
 		list_del(&pdd->per_device_list);
 		kfree(pdd);
 	}
 	kfd_pasid_free(p->pasid);
 	mutex_unlock(&p->mutex);
 	mutex_destroy(&p->mutex);
 	kfree(p->queues);
 	kfree(p);
 	kfree((void *)work);
 }
 static void kfd_process_destroy_delayed(struct rcu_head *rcu)
 {
 	struct kfd_process_release_work *work;
 	struct kfd_process *p;
 	BUG_ON(!kfd_process_wq);
 	p = container_of(rcu, struct kfd_process, rcu);
 	BUG_ON(atomic_read(&p->mm->mm_count) <= 0);
 	mmdrop(p->mm);
 	work = (struct kfd_process_release_work *)
 		kmalloc(sizeof(struct kfd_process_release_work), GFP_KERNEL);
 	if (work) {
 		INIT_WORK((struct work_struct *) work, kfd_process_wq_release);
 		work->p = p;
 		queue_work(kfd_process_wq, (struct work_struct *) work);
 	}
 }
 /*
 * mmu_notifier .release callback for the mm a kfd_process is attached to.
 * Removes the process from kfd_processes_table, waits for SRCU readers of
 * the table, tears down the process queue manager and then hands the final
 * free to kfd_process_destroy_delayed() via mmu_notifier_call_srcu().
 */
 static void kfd_process_notifier_release(struct mmu_notifier *mn,
 					struct mm_struct *mm)
 {
 	struct kfd_process *p;
 	/*
 	 * The kfd_process structure cannot be freed here because the
 	 * mmu_notifier SRCU is read-locked
 	 */
 	p = container_of(mn, struct kfd_process, mmu_notifier);
 	BUG_ON(p->mm != mm);
 	/* Unpublish first so that no new lookup can find the process. */
 	mutex_lock(&kfd_processes_mutex);
 	hash_del_rcu(&p->kfd_processes);
 	mutex_unlock(&kfd_processes_mutex);
 	/* Wait for lookups that started before the removal. */
 	synchronize_srcu(&kfd_processes_srcu);
 	mutex_lock(&p->mutex);
 	/* In case our notifier is called before IOMMU notifier */
 	pqm_uninit(&p->pqm);
 	mutex_unlock(&p->mutex);
 	/*
 	 * Because we drop mm_count inside kfd_process_destroy_delayed
 	 * and because the mmu_notifier_unregister function also drops
 	 * mm_count, we need to take an extra count here.
 	 */
 	atomic_inc(&p->mm->mm_count);
 	mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm);
 	mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
 }
 /* Only the .release hook is implemented; it is installed by create_process(). */
 static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.release = kfd_process_notifier_release,
 };
 static struct kfd_process *create_process(const struct task_struct *thread)
 {
 	struct kfd_process *process;
 	int err = -ENOMEM;
 	process = kzalloc(sizeof(*process), GFP_KERNEL);
 	if (!process)
 		goto err_alloc_process;
 	process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE,
 					sizeof(process->queues[0]), GFP_KERNEL);
 	if (!process->queues)
 		goto err_alloc_queues;
 	process->pasid = kfd_pasid_alloc();
 	if (process->pasid == 0)
 		goto err_alloc_pasid;
 	mutex_init(&process->mutex);
 	process->mm = thread->mm;
 	/* register notifier */
 	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
 	err = __mmu_notifier_register(&process->mmu_notifier, process->mm);
 	if (err)
 		goto err_mmu_notifier;
 	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
 			(uintptr_t)process->mm);
 	process->lead_thread = thread->group_leader;
 	process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE;
 	INIT_LIST_HEAD(&process->per_device_data);
 	err = pqm_init(&process->pqm, process);
 	if (err != 0)
 		goto err_process_pqm_init;
 	return process;
 err_process_pqm_init:
 	hash_del_rcu(&process->kfd_processes);
 	synchronize_rcu();
 	mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
 err_mmu_notifier:
 	kfd_pasid_free(process->pasid);
 err_alloc_pasid:
 	kfree(process->queues);
 err_alloc_queues:
 	kfree(process);
 err_alloc_process:
 	return ERR_PTR(err);
 }
 /*
 * Find the per-device data of process @p for device @dev.
 *
 * @create_pdd: when non-zero and no entry exists, allocate a new one,
 *              initialise it and add it to the process' list.
 *
 * Returns the entry, or NULL when there is none and either @create_pdd is
 * zero or the allocation failed.
 */
 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p,
 							int create_pdd)
 {
 	struct kfd_process_device *pdd;
 	list_for_each_entry(pdd, &p->per_device_data, per_device_list)
 		if (pdd->dev == dev)
 			return pdd;
 	/*
 	 * When the loop runs to completion the cursor does not point at a
 	 * real entry (it is derived from the list head), so it must never be
 	 * handed back to the caller.
 	 */
 	if (!create_pdd)
 		return NULL;
 	pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
 	if (pdd == NULL)
 		return NULL;
 	pdd->dev = dev;
 	INIT_LIST_HEAD(&pdd->qpd.queues_list);
 	INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
 	pdd->qpd.dqm = dev->dqm;
 	list_add(&pdd->per_device_list, &p->per_device_data);
 	return pdd;
 }
 /*
 * Direct the IOMMU to bind the process (specifically the pasid->mm)
 * to the device.
 * Unbinding occurs when the process dies or the device is removed.
 *
 * Assumes that the process lock is held.
 *
 * Returns the per-device data of @p for @dev (created on demand), or
 * ERR_PTR(-ENOMEM) when it cannot be allocated, or an ERR_PTR() with the
 * negative error reported by amd_iommu_bind_pasid().
 */
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 							struct kfd_process *p)
 {
 	struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p, 1);
 	int err;
 	if (pdd == NULL)
 		return ERR_PTR(-ENOMEM);
 	/* Already bound: nothing to do. */
 	if (pdd->bound)
 		return pdd;
 	err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
 	if (err < 0)
 		return ERR_PTR(err);
 	pdd->bound = true;
 	return pdd;
 }
 /*
 * Called when the binding of @pasid to @dev goes away: destroys the queues
 * of the process that owns @pasid and marks its per-device data for @dev
 * as unbound.  Does nothing when no process with @pasid is registered.
 */
 void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
 {
 	struct kfd_process *p;
 	struct kfd_process_device *pdd;
 	int idx, i;
 	BUG_ON(dev == NULL);
 	idx = srcu_read_lock(&kfd_processes_srcu);
 	hash_for_each_rcu(kfd_processes_table, i, p, kfd_processes)
 		if (p->pasid == pasid)
 			break;
 	srcu_read_unlock(&kfd_processes_srcu, idx);
 	/*
 	 * The iteration leaves p NULL when no entry matched, e.g. because
 	 * the process was already removed from the table.  Dereferencing it
 	 * would oops, so there is simply nothing to unbind in that case.
 	 */
 	if (!p) {
 		pr_debug("kfd: no process found for pasid %u\n", pasid);
 		return;
 	}
 	mutex_lock(&p->mutex);
 	pqm_uninit(&p->pqm);
 	pdd = kfd_get_process_device_data(dev, p, 0);
 	/*
 	 * Just mark pdd as unbound, because we still need it to call
 	 * amd_iommu_unbind_pasid() when the process exits.
 	 * We don't call amd_iommu_unbind_pasid() here
 	 * because the IOMMU called us.
 	 */
 	if (pdd)
 		pdd->bound = false;
 	mutex_unlock(&p->mutex);
 }
 /*
 * Return the first entry of @p's per-device data list.  The list is not
 * checked for emptiness here.
 */
 struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p)
 {
 	struct kfd_process_device *first;
 	first = list_first_entry(&p->per_device_data,
 				 struct kfd_process_device,
 				 per_device_list);
 	return first;
 }
 /* Return the entry following @pdd in @p's list, or NULL if @pdd is the last. */
 struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p,
 						struct kfd_process_device *pdd)
 {
 	if (!list_is_last(&pdd->per_device_list, &p->per_device_data))
 		return list_next_entry(pdd, per_device_list);
 	return NULL;
 }
 /* True when @p has at least one per-device data entry. */
 bool kfd_has_process_device_data(struct kfd_process *p)
 {
 	bool empty = list_empty(&p->per_device_data);
 	return !empty;
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@ -0,0 +1,342 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #include <linux/slab.h>
 #include <linux/list.h>
 #include "kfd_device_queue_manager.h"
 #include "kfd_priv.h"
 #include "kfd_kernel_queue.h"
 /*
 * Return the node in @pqm whose user queue or kernel queue carries queue
 * id @qid, or NULL when no such node exists.
 */
 static inline struct process_queue_node *get_queue_by_qid(
 			struct process_queue_manager *pqm, unsigned int qid)
 {
 	struct process_queue_node *node;
 	BUG_ON(!pqm);
 	list_for_each_entry(node, &pqm->queues, process_queue_list) {
 		if ((node->q && node->q->properties.queue_id == qid) ||
 		    (node->kq && node->kq->queue->properties.queue_id == qid))
 			return node;
 	}
 	return NULL;
 }
 /*
 * Reserve the lowest free queue slot of @pqm.
 * On success the slot is marked used in the bitmap, stored in *@qid and 0
 * is returned; -ENOMEM is returned when all
 * max_num_of_queues_per_process slots are taken.
 */
 static int find_available_queue_slot(struct process_queue_manager *pqm,
 					unsigned int *qid)
 {
 	unsigned long slot;
 	BUG_ON(!pqm || !qid);
 	pr_debug("kfd: in %s\n", __func__);
 	slot = find_first_zero_bit(pqm->queue_slot_bitmap,
 			max_num_of_queues_per_process);
 	pr_debug("kfd: the new slot id %lu\n", slot);
 	if (slot < max_num_of_queues_per_process) {
 		set_bit(slot, pqm->queue_slot_bitmap);
 		*qid = slot;
 		return 0;
 	}
 	pr_info("amdkfd: Can not open more queues for process with pasid %d\n",
 			pqm->process->pasid);
 	return -ENOMEM;
 }
 /*
 * Initialise the process queue manager @pqm for process @p: empty queue
 * list and an all-zero queue slot bitmap.
 *
 * Returns 0 on success or -ENOMEM when the bitmap cannot be allocated.
 */
 int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p)
 {
 	BUG_ON(!pqm);
 	INIT_LIST_HEAD(&pqm->queues);
 	/*
 	 * The bitmap helpers used on queue_slot_bitmap operate on whole
 	 * unsigned longs, so size the allocation in longs rather than bytes;
 	 * otherwise a bit count that is not a multiple of BITS_PER_LONG
 	 * makes them touch memory past the end of the buffer.
 	 */
 	pqm->queue_slot_bitmap =
 			kcalloc(BITS_TO_LONGS(max_num_of_queues_per_process),
 					sizeof(unsigned long), GFP_KERNEL);
 	if (pqm->queue_slot_bitmap == NULL)
 		return -ENOMEM;
 	pqm->process = p;
 	return 0;
 }
 /*
 * Destroy every queue still owned by @pqm and release the slot bitmap.
 * Stops early, leaving the bitmap allocated, if a queue cannot be
 * destroyed.
 */
 void pqm_uninit(struct process_queue_manager *pqm)
 {
 	struct process_queue_node *cur, *tmp;
 	unsigned int qid;
 	BUG_ON(!pqm);
 	pr_debug("In func %s\n", __func__);
 	list_for_each_entry_safe(cur, tmp, &pqm->queues, process_queue_list) {
 		/* A node holds either a user queue or a kernel queue. */
 		if (cur->q != NULL)
 			qid = cur->q->properties.queue_id;
 		else
 			qid = cur->kq->queue->properties.queue_id;
 		if (pqm_destroy_queue(pqm, qid) != 0) {
 			pr_err("kfd: failed to destroy queue\n");
 			return;
 		}
 	}
 	kfree(pqm->queue_slot_bitmap);
 	pqm->queue_slot_bitmap = NULL;
 }
 /*
 * Fill in the driver-controlled fields of @q_properties for a compute
 * queue with id @qid and allocate the queue object through init_queue().
 *
 * On success *@q points to the new queue, bound to @dev and to the process
 * owning @pqm, and 0 is returned; otherwise the error from init_queue() is
 * returned and *@q is left as init_queue() left it.
 * @f is currently unused.
 */
 static int create_cp_queue(struct process_queue_manager *pqm,
 				struct kfd_dev *dev, struct queue **q,
 				struct queue_properties *q_properties,
 				struct file *f, unsigned int qid)
 {
 	int retval;
 	/* Doorbell initialized in user space*/
 	q_properties->doorbell_ptr = NULL;
 	q_properties->doorbell_off =
 			kfd_queue_id_to_doorbell(dev, pqm->process, qid);
 	/* let DQM handle it*/
 	q_properties->vmid = 0;
 	q_properties->queue_id = qid;
 	q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
 	retval = init_queue(q, *q_properties);
 	if (retval != 0)
 		return retval;
 	(*q)->device = dev;
 	(*q)->process = pqm->process;
 	/* Terminate the message so it is not merged with the next log line. */
 	pr_debug("kfd: PQM After init queue\n");
 	return 0;
 }
 int pqm_create_queue(struct process_queue_manager *pqm,
 			    struct kfd_dev *dev,
 			    struct file *f,
 			    struct queue_properties *properties,
 			    unsigned int flags,
 			    enum kfd_queue_type type,
 			    unsigned int *qid)
 {
 	int retval;
 	struct kfd_process_device *pdd;
 	struct queue_properties q_properties;
 	struct queue *q;
 	struct process_queue_node *pqn;
 	struct kernel_queue *kq;
 	BUG_ON(!pqm || !dev || !properties || !qid);
 	memset(&q_properties, 0, sizeof(struct queue_properties));
 	memcpy(&q_properties, properties, sizeof(struct queue_properties));
 	q = NULL;
 	kq = NULL;
 	pdd = kfd_get_process_device_data(dev, pqm->process, 1);
 	BUG_ON(!pdd);
 	retval = find_available_queue_slot(pqm, qid);
 	if (retval != 0)
 		return retval;
 	if (list_empty(&pqm->queues)) {
 		pdd->qpd.pqm = pqm;
 		dev->dqm->register_process(dev->dqm, &pdd->qpd);
 	}
 	pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL);
 	if (!pqn) {
 		retval = -ENOMEM;
 		goto err_allocate_pqn;
 	}
 	switch (type) {
 	case KFD_QUEUE_TYPE_COMPUTE:
 		/* check if there is over subscription */
 		if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
 		((dev->dqm->processes_count >= VMID_PER_DEVICE) ||
 		(dev->dqm->queue_count >= PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE))) {
 			pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
 			retval = -EPERM;
 			goto err_create_queue;
 		}
 		retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid);
 		if (retval != 0)
 			goto err_create_queue;
 		pqn->q = q;
 		pqn->kq = NULL;
 		retval = dev->dqm->create_queue(dev->dqm, q, &pdd->qpd,
 						&q->properties.vmid);
 		print_queue(q);
 		break;
 	case KFD_QUEUE_TYPE_DIQ:
 		kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ);
 		if (kq == NULL) {
 			kernel_queue_uninit(kq);
 			goto err_create_queue;
 		}
 		kq->queue->properties.queue_id = *qid;
 		pqn->kq = kq;
 		pqn->q = NULL;
 		retval = dev->dqm->create_kernel_queue(dev->dqm, kq, &pdd->qpd);
 		break;
 	default:
 		BUG();
 		break;
 	}
 	if (retval != 0) {
 		pr_err("kfd: error dqm create queue\n");
 		goto err_create_queue;
 	}
 	pr_debug("kfd: PQM After DQM create queue\n");
 	list_add(&pqn->process_queue_list, &pqm->queues);
 	if (q) {
 		*properties = q->properties;
 		pr_debug("kfd: PQM done creating queue\n");
 		print_queue_properties(properties);
 	}
 	return retval;
 err_create_queue:
 	kfree(pqn);
 err_allocate_pqn:
 	clear_bit(*qid, pqm->queue_slot_bitmap);
 	return retval;
 }
 /*
 * Destroy the queue with id @qid owned by @pqm.
 *
 * Returns 0 on success, -EINVAL when @qid is unknown, or the error reported
 * by the device queue manager's destroy_queue(); in the latter case the
 * node stays on the list and its slot stays reserved.
 */
 int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 {
 	struct process_queue_node *pqn;
 	struct kfd_process_device *pdd;
 	struct device_queue_manager *dqm;
 	struct kfd_dev *dev;
 	int retval;
 	dqm = NULL;
 	BUG_ON(!pqm);
 	retval = 0;
 	pr_debug("kfd: In Func %s\n", __func__);
 	pqn = get_queue_by_qid(pqm, qid);
 	if (pqn == NULL) {
 		pr_err("kfd: queue id does not match any known queue\n");
 		return -EINVAL;
 	}
 	/* The owning device comes from whichever queue object the node holds. */
 	dev = NULL;
 	if (pqn->kq)
 		dev = pqn->kq->dev;
 	if (pqn->q)
 		dev = pqn->q->device;
 	BUG_ON(!dev);
 	pdd = kfd_get_process_device_data(dev, pqm->process, 1);
 	BUG_ON(!pdd);
 	if (pqn->kq) {
 		/* destroy kernel queue (DIQ) */
 		dqm = pqn->kq->dev->dqm;
 		dqm->destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
 		kernel_queue_uninit(pqn->kq);
 	}
 	if (pqn->q) {
 		/* destroy user (compute) queue */
 		dqm = pqn->q->device->dqm;
 		retval = dqm->destroy_queue(dqm, &pdd->qpd, pqn->q);
 		if (retval != 0)
 			return retval;
 		uninit_queue(pqn->q);
 	}
 	list_del(&pqn->process_queue_list);
 	kfree(pqn);
 	clear_bit(qid, pqm->queue_slot_bitmap);
 	/* Last queue of the process gone: unregister it from the DQM. */
 	if (list_empty(&pqm->queues))
 		dqm->unregister_process(dqm, &pdd->qpd);
 	return retval;
 }
 int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
 			struct queue_properties *p)
 {
 	int retval;
 	struct process_queue_node *pqn;
 	BUG_ON(!pqm);
 	pqn = get_queue_by_qid(pqm, qid);
 	BUG_ON(!pqn);
 	pqn->q->properties.queue_address = p->queue_address;
 	pqn->q->properties.queue_size = p->queue_size;
 	pqn->q->properties.queue_percent = p->queue_percent;
 	pqn->q->properties.priority = p->priority;
 	retval = pqn->q->device->dqm->update_queue(pqn->q->device->dqm, pqn->q);
 	if (retval != 0)
 		return retval;
 	return 0;
 }
 /*
 * Return the kernel queue registered under @qid in @pqm, or NULL when the
 * id is unknown or the node holds no kernel queue.
 */
 struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
 					unsigned int qid)
 {
 	struct process_queue_node *node;
 	BUG_ON(!pqm);
 	node = get_queue_by_qid(pqm, qid);
 	if (!node)
 		return NULL;
 	/* kq is NULL for nodes that hold a user queue. */
 	return node->kq;
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@ -0,0 +1,85 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
 #include <linux/slab.h>
 #include "kfd_priv.h"
 /*
 * Dump every field of @q through pr_debug(), one line per field.
 * A NULL @q is silently ignored.
 */
 void print_queue_properties(struct queue_properties *q)
 {
 	if (!q)
 		return;
 	pr_debug("Printing queue properties:\n");
 	pr_debug("Queue Type: %u\n", q->type);
 	pr_debug("Queue Size: %llu\n", q->queue_size);
 	pr_debug("Queue percent: %u\n", q->queue_percent);
 	pr_debug("Queue Address: 0x%llX\n", q->queue_address);
 	pr_debug("Queue Id: %u\n", q->queue_id);
 	pr_debug("Queue Process Vmid: %u\n", q->vmid);
 	pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr);
 	pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr);
 	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
 	pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
 }
 /*
 * Dump the properties of @q plus its MQD, process and device pointers
 * through pr_debug().  A NULL @q is silently ignored.
 */
 void print_queue(struct queue *q)
 {
 	if (!q)
 		return;
 	pr_debug("Printing queue:\n");
 	pr_debug("Queue Type: %u\n", q->properties.type);
 	pr_debug("Queue Size: %llu\n", q->properties.queue_size);
 	pr_debug("Queue percent: %u\n", q->properties.queue_percent);
 	pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
 	pr_debug("Queue Id: %u\n", q->properties.queue_id);
 	pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
 	pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr);
 	pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr);
 	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
 	pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
 	pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
 	pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr);
 	pr_debug("Queue Process Address: 0x%p\n", q->process);
 	pr_debug("Queue Device Address: 0x%p\n", q->device);
 }
 int init_queue(struct queue **q, struct queue_properties properties)
 {
 	struct queue *tmp;
 	BUG_ON(!q);
 	tmp = kzalloc(sizeof(struct queue), GFP_KERNEL);
 	if (!tmp)
 		return -ENOMEM;
 	memcpy(&tmp->properties, &properties, sizeof(struct queue_properties));
 	*q = tmp;
 	return 0;
 }
 /* Free a queue object allocated by init_queue(). */
 void uninit_queue(struct queue *q)
 {
 	kfree(q);
 }
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@ -0,0 +1,168 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef __KFD_TOPOLOGY_H__
 #define __KFD_TOPOLOGY_H__
 #include <linux/types.h>
 #include <linux/list.h>
 #include "kfd_priv.h"
 /* Number of elements in kfd_node_properties.marketing_name. */
 #define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128
 /*
 * HSA_CAP_* bit flags and fields.
 * NOTE(review): presumably these describe kfd_node_properties.capability --
 * confirm against the topology code.
 */
 #define HSA_CAP_HOT_PLUGGABLE			0x00000001
 #define HSA_CAP_ATS_PRESENT			0x00000002
 #define HSA_CAP_SHARED_WITH_GRAPHICS		0x00000004
 #define HSA_CAP_QUEUE_SIZE_POW2			0x00000008
 #define HSA_CAP_QUEUE_SIZE_32BIT		0x00000010
 #define HSA_CAP_QUEUE_IDLE_EVENT		0x00000020
 #define HSA_CAP_VA_LIMIT			0x00000040
 #define HSA_CAP_WATCH_POINTS_SUPPORTED		0x00000080
 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK	0x00000f00
 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT	8
 #define HSA_CAP_RESERVED			0xfffff000
 /*
 * Properties of one topology node; embedded as node_props in
 * struct kfd_topology_device.
 */
 struct kfd_node_properties {
 	uint32_t cpu_cores_count;
 	uint32_t simd_count;
 	uint32_t mem_banks_count;
 	uint32_t caches_count;
 	uint32_t io_links_count;
 	uint32_t cpu_core_id_base;
 	uint32_t simd_id_base;
 	uint32_t capability;
 	uint32_t max_waves_per_simd;
 	uint32_t lds_size_in_kb;
 	uint32_t gds_size_in_kb;
 	uint32_t wave_front_size;
 	uint32_t array_count;
 	uint32_t simd_arrays_per_engine;
 	uint32_t cu_per_simd_array;
 	uint32_t simd_per_cu;
 	uint32_t max_slots_scratch_cu;
 	uint32_t engine_id;
 	uint32_t vendor_id;
 	uint32_t device_id;
 	uint32_t location_id;
 	uint32_t max_engine_clk_fcompute;
 	uint32_t max_engine_clk_ccompute;
 	uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
 };
 /*
 * HSA_MEM_HEAP_TYPE_* values.
 * NOTE(review): presumably the values of kfd_mem_properties.heap_type --
 * confirm against the topology code.
 */
 #define HSA_MEM_HEAP_TYPE_SYSTEM	0
 #define HSA_MEM_HEAP_TYPE_FB_PUBLIC	1
 #define HSA_MEM_HEAP_TYPE_FB_PRIVATE	2
 #define HSA_MEM_HEAP_TYPE_GPU_GDS	3
 #define HSA_MEM_HEAP_TYPE_GPU_LDS	4
 #define HSA_MEM_HEAP_TYPE_GPU_SCRATCH	5
 /* HSA_MEM_FLAGS_* bit flags (presumably for kfd_mem_properties.flags). */
 #define HSA_MEM_FLAGS_HOT_PLUGGABLE	0x00000001
 #define HSA_MEM_FLAGS_NON_VOLATILE	0x00000002
 #define HSA_MEM_FLAGS_RESERVED		0xfffffffc
 /*
 * Description of one memory bank.  Carries a list link plus a kobject
 * pointer and an attribute next to the property values.
 */
 struct kfd_mem_properties {
 	struct list_head	list;
 	uint32_t		heap_type;
 	uint64_t		size_in_bytes;
 	uint32_t		flags;
 	uint32_t		width;
 	uint32_t		mem_clk_max;
 	struct kobject		*kobj;
 	struct attribute	attr;
 };
 /* Number of elements in kfd_cache_properties.sibling_map. */
 #define KFD_TOPOLOGY_CPU_SIBLINGS 256
 /* HSA_CACHE_TYPE_* bit flags (presumably for kfd_cache_properties.cache_type). */
 #define HSA_CACHE_TYPE_DATA		0x00000001
 #define HSA_CACHE_TYPE_INSTRUCTION	0x00000002
 #define HSA_CACHE_TYPE_CPU		0x00000004
 #define HSA_CACHE_TYPE_HSACU		0x00000008
 #define HSA_CACHE_TYPE_RESERVED		0xfffffff0
 /*
 * Description of one cache.  Carries a list link plus a kobject pointer
 * and an attribute next to the property values.
 */
 struct kfd_cache_properties {
 	struct list_head	list;
 	uint32_t		processor_id_low;
 	uint32_t		cache_level;
 	uint32_t		cache_size;
 	uint32_t		cacheline_size;
 	uint32_t		cachelines_per_tag;
 	uint32_t		cache_assoc;
 	uint32_t		cache_latency;
 	uint32_t		cache_type;
 	uint8_t			sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
 	struct kobject		*kobj;
 	struct attribute	attr;
 };
 /*
 * Description of one IO link (node_from -> node_to).  Carries a list link
 * plus a kobject pointer and an attribute next to the property values.
 */
 struct kfd_iolink_properties {
 	struct list_head	list;
 	uint32_t		iolink_type;
 	uint32_t		ver_maj;
 	uint32_t		ver_min;
 	uint32_t		node_from;
 	uint32_t		node_to;
 	uint32_t		weight;
 	uint32_t		min_latency;
 	uint32_t		max_latency;
 	uint32_t		min_bandwidth;
 	uint32_t		max_bandwidth;
 	uint32_t		rec_transfer_size;
 	uint32_t		flags;
 	struct kobject		*kobj;
 	struct attribute	attr;
 };
 /*
 * One device in the topology: its node properties, three lists
 * (mem_props, cache_props, io_link_props) each with an element counter,
 * a pointer to the kfd_dev and the kobjects/attributes associated with it.
 * NOTE(review): the element types of the three lists are presumably
 * kfd_mem_properties, kfd_cache_properties and kfd_iolink_properties --
 * confirm in kfd_topology.c.
 */
 struct kfd_topology_device {
 	struct list_head		list;
 	uint32_t			gpu_id;
 	struct kfd_node_properties	node_props;
 	uint32_t			mem_bank_count;
 	struct list_head		mem_props;
 	uint32_t			cache_count;
 	struct list_head		cache_props;
 	uint32_t			io_link_count;
 	struct list_head		io_link_props;
 	struct kfd_dev			*gpu;
 	struct kobject			*kobj_node;
 	struct kobject			*kobj_mem;
 	struct kobject			*kobj_cache;
 	struct kobject			*kobj_iolink;
 	struct attribute		attr_gpuid;
 	struct attribute		attr_name;
 	struct attribute		attr_props;
 };
 /*
 * System-wide topology properties together with the kobjects and
 * attributes associated with them.
 */
 struct kfd_system_properties {
 	uint32_t		num_devices;     /* Number of H-NUMA nodes */
 	uint32_t		generation_count;
 	uint64_t		platform_oem;
 	uint64_t		platform_id;
 	uint64_t		platform_rev;
 	struct kobject		*kobj_topology;
 	struct kobject		*kobj_nodes;
 	struct attribute	attr_genid;
 	struct attribute	attr_props;
 };
 #endif /* __KFD_TOPOLOGY_H__ */
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@ -0,0 +1,185 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 /*
 * This file defines the private interface between the
 * AMD kernel graphics drivers and the AMD KFD.
 */
 #ifndef KGD_KFD_INTERFACE_H_INCLUDED
 #define KGD_KFD_INTERFACE_H_INCLUDED
 #include <linux/types.h>
 struct pci_dev;
 #define KFD_INTERFACE_VERSION 1
 struct kfd_dev;
 struct kgd_dev;
 struct kgd_mem;
 /* Memory pool selector passed to kfd2kgd_calls.allocate_mem(). */
 enum kgd_memory_pool {
 	KGD_POOL_SYSTEM_CACHEABLE = 1,
 	KGD_POOL_SYSTEM_WRITECOMBINE = 2,
 	KGD_POOL_FRAMEBUFFER = 3,
 };
 /*
 * GPU resources made available to KFD; passed to
 * kgd2kfd_calls.device_init().
 */
 struct kgd2kfd_shared_resources {
 	/* Bit n == 1 means VMID n is available for KFD. */
 	unsigned int compute_vmid_bitmap;
 	/* Compute pipes are counted starting from MEC0/pipe0 as 0. */
 	unsigned int first_compute_pipe;
 	/* Number of MEC pipes available for KFD. */
 	unsigned int compute_pipe_count;
 	/* Base address of doorbell aperture. */
 	phys_addr_t doorbell_physical_address;
 	/* Size in bytes of doorbell aperture. */
 	size_t doorbell_aperture_size;
 	/* Number of bytes at start of aperture reserved for KGD. */
 	size_t doorbell_start_offset;
 };
 /**
 * struct kgd2kfd_calls
 *
 * @exit: Notifies amdkfd that kgd module is unloaded
 *
 * @probe: Notifies amdkfd about a probe done on a device in the kgd driver.
 *
 * @device_init: Initialize the newly probed device (if it is a device that
 * amdkfd supports)
 *
 * @device_exit: Notifies amdkfd about a removal of a kgd device
 *
 * @interrupt: Hands an interrupt ring entry (@ih_ring_entry) for a kgd
 * device to amdkfd. NOTE(review): the entry layout and calling context are
 * not visible in this header -- confirm with the implementation.
 *
 * @suspend: Notifies amdkfd about a suspend action done to a kgd device
 *
 * @resume: Notifies amdkfd about a resume action done to a kgd device
 *
 * This structure contains function callback pointers so the kgd driver
 * can notify amdkfd about certain status changes.
 *
 */
 struct kgd2kfd_calls {
 	void (*exit)(void);
 	struct kfd_dev* (*probe)(struct kgd_dev *kgd, struct pci_dev *pdev);
 	bool (*device_init)(struct kfd_dev *kfd,
 			const struct kgd2kfd_shared_resources *gpu_resources);
 	void (*device_exit)(struct kfd_dev *kfd);
 	void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
 	void (*suspend)(struct kfd_dev *kfd);
 	int (*resume)(struct kfd_dev *kfd);
 };
 /**
 * struct kfd2kgd_calls
 *
 * @init_sa_manager: Initialize an instance of the sa manager, used by
 * amdkfd for all system memory allocations that are mapped to the GART
 * address space
 *
 * @fini_sa_manager: Releases all memory allocations for amdkfd that are
 * handled by kgd sa manager
 *
 * @allocate_mem: Allocate a buffer from amdkfd's sa manager. The buffer can
 * be used for mqds, hpds, kernel queue, fence and runlists
 *
 * @free_mem: Frees a buffer that was allocated by amdkfd's sa manager
 *
 * @get_vmem_size: Retrieves (physical) size of VRAM
 *
 * @get_gpu_clock_counter: Retrieves GPU clock counter
 *
 * @get_max_engine_clock_in_mhz: Retrieves maximum GPU clock in MHz
 *
 * @program_sh_mem_settings: A function that should initiate the memory
 * properties such as main aperture memory type (cache / non cached) and
 * secondary aperture base address, size and memory type.
 * This function is used only for no cp scheduling mode.
 *
 * @set_pasid_vmid_mapping: Exposes pasid/vmid pair to the H/W for no cp
 * scheduling mode. Only used for no cp scheduling mode.
 *
 * @init_memory: Initializes memory apertures to fixed base/limit address
 * and non cached memory types.
 *
 * @init_pipeline: Initializes the compute pipelines.
 *
 * @hqd_load: Loads the mqd structure to a H/W hqd slot. Used only for no cp
 * scheduling mode.
 *
 * @hqd_is_occupies: Checks if a hqd slot is occupied.
 *
 * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
 *
 * This structure contains function pointers to services that the kgd driver
 * provides to amdkfd driver.
 *
 */
 struct kfd2kgd_calls {
 	/* Memory management. */
 	int (*init_sa_manager)(struct kgd_dev *kgd, unsigned int size);
 	void (*fini_sa_manager)(struct kgd_dev *kgd);
 	int (*allocate_mem)(struct kgd_dev *kgd, size_t size, size_t alignment,
 			enum kgd_memory_pool pool, struct kgd_mem **mem);
 	void (*free_mem)(struct kgd_dev *kgd, struct kgd_mem *mem);
 	uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
 	uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
 	uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd);
 	/* Register access functions */
 	void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid,
 			uint32_t sh_mem_config,	uint32_t sh_mem_ape1_base,
 			uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
 	int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid,
 					unsigned int vmid);
 	int (*init_memory)(struct kgd_dev *kgd);
 	int (*init_pipeline)(struct kgd_dev *kgd, uint32_t pipe_id,
 				uint32_t hpd_size, uint64_t hpd_gpu_addr);
 	int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr);
 	bool (*hqd_is_occupies)(struct kgd_dev *kgd, uint64_t queue_address,
 				uint32_t pipe_id, uint32_t queue_id);
 	int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type,
 				unsigned int timeout, uint32_t pipe_id,
 				uint32_t queue_id);
 };
 /*
 * Entry point through which the two call tables are exchanged: the caller
 * passes its kfd2kgd_calls table in @f2g and receives a pointer to the
 * kgd2kfd_calls table through @g2f.
 * NOTE(review): presumably @interface_version is checked against
 * KFD_INTERFACE_VERSION and false is returned on mismatch -- confirm with
 * the implementation.
 */
 bool kgd2kfd_init(unsigned interface_version,
 		  const struct kfd2kgd_calls *f2g,
 		  const struct kgd2kfd_calls **g2f);
 #endif /* KGD_KFD_INTERFACE_H_INCLUDED */
--- a/drivers/gpu/drm/radeon/Makefile
+++ b/drivers/gpu/drm/radeon/Makefile
@ -104,6 +104,7 @@ radeon-y += \
 	radeon_vce.o \
 	vce_v1_0.o \
 	vce_v2_0.o \
 	radeon_kfd.o
 radeon-$(CONFIG_COMPAT) += radeon_ioc32.o
 radeon-$(CONFIG_VGA_SWITCHEROO) += radeon_atpx_handler.o
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@ -32,6 +32,7 @@
 #include "cik_blit_shaders.h"
 #include "radeon_ucode.h"
 #include "clearstate_ci.h"
 #include "radeon_kfd.h"
 MODULE_FIRMWARE("radeon/BONAIRE_pfp.bin");
 MODULE_FIRMWARE("radeon/BONAIRE_me.bin");
@ -1563,6 +1564,8 @@ static const u32 godavari_golden_registers[] =
 static void cik_init_golden_registers(struct radeon_device *rdev)
 {
 	/* Some of the registers might be dependent on GRBM_GFX_INDEX */
 	mutex_lock(&rdev->grbm_idx_mutex);
 	switch (rdev->family) {
 	case CHIP_BONAIRE:
 		radeon_program_register_sequence(rdev,
@ -1637,6 +1640,7 @@ static void cik_init_golden_registers(struct radeon_device *rdev)
 	default:
 		break;
 	}
 	mutex_unlock(&rdev->grbm_idx_mutex);
 }
 /**
@ -3428,6 +3432,7 @@ static void cik_setup_rb(struct radeon_device *rdev,
 	u32 disabled_rbs = 0;
 	u32 enabled_rbs = 0;
 	mutex_lock(&rdev->grbm_idx_mutex);
 	for (i = 0; i < se_num; i++) {
 		for (j = 0; j < sh_per_se; j++) {
 			cik_select_se_sh(rdev, i, j);
@ -3439,6 +3444,7 @@ static void cik_setup_rb(struct radeon_device *rdev,
 		}
 	}
 	cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 	mutex_unlock(&rdev->grbm_idx_mutex);
 	mask = 1;
 	for (i = 0; i < max_rb_num_per_se * se_num; i++) {
@ -3449,6 +3455,7 @@ static void cik_setup_rb(struct radeon_device *rdev,
 	rdev->config.cik.backend_enable_mask = enabled_rbs;
 	mutex_lock(&rdev->grbm_idx_mutex);
 	for (i = 0; i < se_num; i++) {
 		cik_select_se_sh(rdev, i, 0xffffffff);
 		data = 0;
@ -3476,6 +3483,7 @@ static void cik_setup_rb(struct radeon_device *rdev,
 		WREG32(PA_SC_RASTER_CONFIG, data);
 	}
 	cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 	mutex_unlock(&rdev->grbm_idx_mutex);
 }
 /**
@ -3693,6 +3701,12 @@ static void cik_gpu_init(struct radeon_device *rdev)
 	/* set HW defaults for 3D engine */
 	WREG32(CP_MEQ_THRESHOLDS, MEQ1_START(0x30) | MEQ2_START(0x60));
 	mutex_lock(&rdev->grbm_idx_mutex);
 	/*
 	 * making sure that the following register writes will be broadcasted
 	 * to all the shaders
 	 */
 	cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 	WREG32(SX_DEBUG_1, 0x20);
 	WREG32(TA_CNTL_AUX, 0x00010000);
@ -3748,6 +3762,7 @@ static void cik_gpu_init(struct radeon_device *rdev)
 	WREG32(PA_CL_ENHANCE, CLIP_VTX_REORDER_ENA | NUM_CLIP_SEQ(3));
 	WREG32(PA_SC_ENHANCE, ENABLE_PA_SC_OUT_OF_ORDER);
 	mutex_unlock(&rdev->grbm_idx_mutex);
 	udelay(50);
 }
@ -4684,12 +4699,11 @@ static int cik_mec_init(struct radeon_device *rdev)
 	/*
 	 * KV:    2 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 64 Queues total
 	 * CI/KB: 1 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 32 Queues total
 	 * Nonetheless, we assign only 1 pipe because all other pipes will
 	 * be handled by KFD
 	 */
-	if (rdev->family == CHIP_KAVERI)
+	rdev->mec.num_mec = 1;
-		rdev->mec.num_mec = 2;
+	rdev->mec.num_pipe = 1;
 	else
 		rdev->mec.num_mec = 1;
 	rdev->mec.num_pipe = 4;
 	rdev->mec.num_queue = rdev->mec.num_mec * rdev->mec.num_pipe * 8;
 	if (rdev->mec.hpd_eop_obj == NULL) {
@ -4831,28 +4845,24 @@ static int cik_cp_compute_resume(struct radeon_device *rdev)
 	/* init the pipes */
 	mutex_lock(&rdev->srbm_mutex);
 	for (i = 0; i < (rdev->mec.num_pipe * rdev->mec.num_mec); i++) {
 		int me = (i < 4) ? 1 : 2;
 		int pipe = (i < 4) ? i : (i - 4);
-		eop_gpu_addr = rdev->mec.hpd_eop_gpu_addr + (i * MEC_HPD_SIZE * 2);
+	eop_gpu_addr = rdev->mec.hpd_eop_gpu_addr;
 		cik_srbm_select(rdev, me, pipe, 0, 0);
 		/* write the EOP addr */
 		WREG32(CP_HPD_EOP_BASE_ADDR, eop_gpu_addr >> 8);
 		WREG32(CP_HPD_EOP_BASE_ADDR_HI, upper_32_bits(eop_gpu_addr) >> 8);
 		/* set the VMID assigned */
 		WREG32(CP_HPD_EOP_VMID, 0);
 		/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
 		tmp = RREG32(CP_HPD_EOP_CONTROL);
 		tmp &= ~EOP_SIZE_MASK;
 		tmp |= order_base_2(MEC_HPD_SIZE / 8);
 		WREG32(CP_HPD_EOP_CONTROL, tmp);
 	}
 	cik_srbm_select(rdev, 0, 0, 0, 0);
 	/* write the EOP addr */
 	WREG32(CP_HPD_EOP_BASE_ADDR, eop_gpu_addr >> 8);
 	WREG32(CP_HPD_EOP_BASE_ADDR_HI, upper_32_bits(eop_gpu_addr) >> 8);
 	/* set the VMID assigned */
 	WREG32(CP_HPD_EOP_VMID, 0);
 	/* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
 	tmp = RREG32(CP_HPD_EOP_CONTROL);
 	tmp &= ~EOP_SIZE_MASK;
 	tmp |= order_base_2(MEC_HPD_SIZE / 8);
 	WREG32(CP_HPD_EOP_CONTROL, tmp);
 	mutex_unlock(&rdev->srbm_mutex);
 	/* init the queues.  Just two for now. */
@ -5906,8 +5916,13 @@ int cik_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
 */
 int cik_vm_init(struct radeon_device *rdev)
 {
-	/* number of VMs */
+	/*
-	rdev->vm_manager.nvm = 16;
+	 * number of VMs
 	 * VMID 0 is reserved for System
 	 * radeon graphics/compute will use VMIDs 1-7
 	 * amdkfd will use VMIDs 8-15
 	 */
 	rdev->vm_manager.nvm = RADEON_NUM_OF_VMIDS;
 	/* base offset of vram pages */
 	if (rdev->flags & RADEON_IS_IGP) {
 		u64 tmp = RREG32(MC_VM_FB_OFFSET);
@ -6068,6 +6083,7 @@ static void cik_wait_for_rlc_serdes(struct radeon_device *rdev)
 	u32 i, j, k;
 	u32 mask;
 	mutex_lock(&rdev->grbm_idx_mutex);
 	for (i = 0; i < rdev->config.cik.max_shader_engines; i++) {
 		for (j = 0; j < rdev->config.cik.max_sh_per_se; j++) {
 			cik_select_se_sh(rdev, i, j);
@ -6079,6 +6095,7 @@ static void cik_wait_for_rlc_serdes(struct radeon_device *rdev)
 		}
 	}
 	cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 	mutex_unlock(&rdev->grbm_idx_mutex);
 	mask = SE_MASTER_BUSY_MASK | GC_MASTER_BUSY | TC0_MASTER_BUSY | TC1_MASTER_BUSY;
 	for (k = 0; k < rdev->usec_timeout; k++) {
@ -6213,10 +6230,12 @@ static int cik_rlc_resume(struct radeon_device *rdev)
 	WREG32(RLC_LB_CNTR_INIT, 0);
 	WREG32(RLC_LB_CNTR_MAX, 0x00008000);
 	mutex_lock(&rdev->grbm_idx_mutex);
 	cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 	WREG32(RLC_LB_INIT_CU_MASK, 0xffffffff);
 	WREG32(RLC_LB_PARAMS, 0x00600408);
 	WREG32(RLC_LB_CNTL, 0x80000004);
 	mutex_unlock(&rdev->grbm_idx_mutex);
 	WREG32(RLC_MC_CNTL, 0);
 	WREG32(RLC_UCODE_CNTL, 0);
@ -6283,11 +6302,13 @@ static void cik_enable_cgcg(struct radeon_device *rdev, bool enable)
 		tmp = cik_halt_rlc(rdev);
 		mutex_lock(&rdev->grbm_idx_mutex);
 		cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 		WREG32(RLC_SERDES_WR_CU_MASTER_MASK, 0xffffffff);
 		WREG32(RLC_SERDES_WR_NONCU_MASTER_MASK, 0xffffffff);
 		tmp2 = BPM_ADDR_MASK | CGCG_OVERRIDE_0 | CGLS_ENABLE;
 		WREG32(RLC_SERDES_WR_CTRL, tmp2);
 		mutex_unlock(&rdev->grbm_idx_mutex);
 		cik_update_rlc(rdev, tmp);
@ -6329,11 +6350,13 @@ static void cik_enable_mgcg(struct radeon_device *rdev, bool enable)
 		tmp = cik_halt_rlc(rdev);
 		mutex_lock(&rdev->grbm_idx_mutex);
 		cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 		WREG32(RLC_SERDES_WR_CU_MASTER_MASK, 0xffffffff);
 		WREG32(RLC_SERDES_WR_NONCU_MASTER_MASK, 0xffffffff);
 		data = BPM_ADDR_MASK | MGCG_OVERRIDE_0;
 		WREG32(RLC_SERDES_WR_CTRL, data);
 		mutex_unlock(&rdev->grbm_idx_mutex);
 		cik_update_rlc(rdev, tmp);
@ -6377,11 +6400,13 @@ static void cik_enable_mgcg(struct radeon_device *rdev, bool enable)
 		tmp = cik_halt_rlc(rdev);
 		mutex_lock(&rdev->grbm_idx_mutex);
 		cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 		WREG32(RLC_SERDES_WR_CU_MASTER_MASK, 0xffffffff);
 		WREG32(RLC_SERDES_WR_NONCU_MASTER_MASK, 0xffffffff);
 		data = BPM_ADDR_MASK | MGCG_OVERRIDE_1;
 		WREG32(RLC_SERDES_WR_CTRL, data);
 		mutex_unlock(&rdev->grbm_idx_mutex);
 		cik_update_rlc(rdev, tmp);
 	}
@ -6810,10 +6835,12 @@ static u32 cik_get_cu_active_bitmap(struct radeon_device *rdev, u32 se, u32 sh)
 	u32 mask = 0, tmp, tmp1;
 	int i;
 	mutex_lock(&rdev->grbm_idx_mutex);
 	cik_select_se_sh(rdev, se, sh);
 	tmp = RREG32(CC_GC_SHADER_ARRAY_CONFIG);
 	tmp1 = RREG32(GC_USER_SHADER_ARRAY_CONFIG);
 	cik_select_se_sh(rdev, 0xffffffff, 0xffffffff);
 	mutex_unlock(&rdev->grbm_idx_mutex);
 	tmp &= 0xffff0000;
@ -7297,8 +7324,7 @@ static int cik_irq_init(struct radeon_device *rdev)
 int cik_irq_set(struct radeon_device *rdev)
 {
 	u32 cp_int_cntl;
-	u32 cp_m1p0, cp_m1p1, cp_m1p2, cp_m1p3;
+	u32 cp_m1p0;
 	u32 cp_m2p0, cp_m2p1, cp_m2p2, cp_m2p3;
 	u32 crtc1 = 0, crtc2 = 0, crtc3 = 0, crtc4 = 0, crtc5 = 0, crtc6 = 0;
 	u32 hpd1, hpd2, hpd3, hpd4, hpd5, hpd6;
 	u32 grbm_int_cntl = 0;
@ -7332,13 +7358,6 @@ int cik_irq_set(struct radeon_device *rdev)
 	dma_cntl1 = RREG32(SDMA0_CNTL + SDMA1_REGISTER_OFFSET) & ~TRAP_ENABLE;
 	cp_m1p0 = RREG32(CP_ME1_PIPE0_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	cp_m1p1 = RREG32(CP_ME1_PIPE1_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	cp_m1p2 = RREG32(CP_ME1_PIPE2_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	cp_m1p3 = RREG32(CP_ME1_PIPE3_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	cp_m2p0 = RREG32(CP_ME2_PIPE0_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	cp_m2p1 = RREG32(CP_ME2_PIPE1_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	cp_m2p2 = RREG32(CP_ME2_PIPE2_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	cp_m2p3 = RREG32(CP_ME2_PIPE3_INT_CNTL) & ~TIME_STAMP_INT_ENABLE;
 	if (rdev->flags & RADEON_IS_IGP)
 		thermal_int = RREG32_SMC(CG_THERMAL_INT_CTRL) &
@ -7360,33 +7379,6 @@ int cik_irq_set(struct radeon_device *rdev)
 			case 0:
 				cp_m1p0 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 1:
 				cp_m1p1 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 2:
 				cp_m1p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 3:
 				cp_m1p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			default:
 				DRM_DEBUG("si_irq_set: sw int cp1 invalid pipe %d\n", ring->pipe);
 				break;
 			}
 		} else if (ring->me == 2) {
 			switch (ring->pipe) {
 			case 0:
 				cp_m2p0 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 1:
 				cp_m2p1 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 2:
 				cp_m2p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 3:
 				cp_m2p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			default:
 				DRM_DEBUG("si_irq_set: sw int cp1 invalid pipe %d\n", ring->pipe);
 				break;
@ -7403,33 +7395,6 @@ int cik_irq_set(struct radeon_device *rdev)
 			case 0:
 				cp_m1p0 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 1:
 				cp_m1p1 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 2:
 				cp_m1p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 3:
 				cp_m1p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			default:
 				DRM_DEBUG("si_irq_set: sw int cp2 invalid pipe %d\n", ring->pipe);
 				break;
 			}
 		} else if (ring->me == 2) {
 			switch (ring->pipe) {
 			case 0:
 				cp_m2p0 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 1:
 				cp_m2p1 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 2:
 				cp_m2p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			case 3:
 				cp_m2p2 |= TIME_STAMP_INT_ENABLE;
 				break;
 			default:
 				DRM_DEBUG("si_irq_set: sw int cp2 invalid pipe %d\n", ring->pipe);
 				break;
@ -7518,13 +7483,6 @@ int cik_irq_set(struct radeon_device *rdev)
 	WREG32(SDMA0_CNTL + SDMA1_REGISTER_OFFSET, dma_cntl1);
 	WREG32(CP_ME1_PIPE0_INT_CNTL, cp_m1p0);
 	WREG32(CP_ME1_PIPE1_INT_CNTL, cp_m1p1);
 	WREG32(CP_ME1_PIPE2_INT_CNTL, cp_m1p2);
 	WREG32(CP_ME1_PIPE3_INT_CNTL, cp_m1p3);
 	WREG32(CP_ME2_PIPE0_INT_CNTL, cp_m2p0);
 	WREG32(CP_ME2_PIPE1_INT_CNTL, cp_m2p1);
 	WREG32(CP_ME2_PIPE2_INT_CNTL, cp_m2p2);
 	WREG32(CP_ME2_PIPE3_INT_CNTL, cp_m2p3);
 	WREG32(GRBM_INT_CNTL, grbm_int_cntl);
@ -7841,6 +7799,10 @@ int cik_irq_process(struct radeon_device *rdev)
 	while (rptr != wptr) {
 		/* wptr/rptr are in bytes! */
 		ring_index = rptr / 4;
 		radeon_kfd_interrupt(rdev,
 				(const void *) &rdev->ih.ring[ring_index]);
 		src_id =  le32_to_cpu(rdev->ih.ring[ring_index]) & 0xff;
 		src_data = le32_to_cpu(rdev->ih.ring[ring_index + 1]) & 0xfffffff;
 		ring_id = le32_to_cpu(rdev->ih.ring[ring_index + 2]) & 0xff;
@ -8530,6 +8492,10 @@ static int cik_startup(struct radeon_device *rdev)
 	if (r)
 		return r;
 	r = radeon_kfd_resume(rdev);
 	if (r)
 		return r;
 	return 0;
 }
@ -8578,6 +8544,7 @@ int cik_resume(struct radeon_device *rdev)
 */
 int cik_suspend(struct radeon_device *rdev)
 {
 	radeon_kfd_suspend(rdev);
 	radeon_pm_suspend(rdev);
 	dce6_audio_fini(rdev);
 	radeon_vm_manager_fini(rdev);
--- a/drivers/gpu/drm/radeon/cik_reg.h
+++ b/drivers/gpu/drm/radeon/cik_reg.h
@ -147,4 +147,140 @@
 #define CIK_LB_DESKTOP_HEIGHT                     0x6b0c
 #define CP_HQD_IQ_RPTR					0xC970u
 #define AQL_ENABLE					(1U << 0)
 #define IDLE					(1 << 2)
 /*
  * struct cik_mqd - in-memory image of a CIK compute queue descriptor (MQD).
  *
  * The structure is exactly 128 consecutive 32-bit words.  The reserved_NN
  * members are named after their dword index (reserved_96 is dword 96,
  * reserved_108 is dword 108), so members must never be added, removed or
  * reordered.
  *
  * Member names follow the COMPUTE_*, CP_HQD_* and CP_MQD_* register names.
  * NOTE(review): presumably each member holds the value of the register of
  * the same name and this is the layout behind the "void *mqd" handed to
  * hqd_load() - confirm against the hardware documentation.
  */
 struct cik_mqd {
	/* dwords 0-32: compute dispatch state */
	uint32_t header;
	uint32_t compute_dispatch_initiator;
	uint32_t compute_dim_x;
	uint32_t compute_dim_y;
	uint32_t compute_dim_z;
	uint32_t compute_start_x;
	uint32_t compute_start_y;
	uint32_t compute_start_z;
	uint32_t compute_num_thread_x;
	uint32_t compute_num_thread_y;
	uint32_t compute_num_thread_z;
	uint32_t compute_pipelinestat_enable;
	uint32_t compute_perfcount_enable;
	uint32_t compute_pgm_lo;
	uint32_t compute_pgm_hi;
	uint32_t compute_tba_lo;
	uint32_t compute_tba_hi;
	uint32_t compute_tma_lo;
	uint32_t compute_tma_hi;
	uint32_t compute_pgm_rsrc1;
	uint32_t compute_pgm_rsrc2;
	uint32_t compute_vmid;
	uint32_t compute_resource_limits;
	uint32_t compute_static_thread_mgmt_se0;
	uint32_t compute_static_thread_mgmt_se1;
	uint32_t compute_tmpring_size;
	uint32_t compute_static_thread_mgmt_se2;
	uint32_t compute_static_thread_mgmt_se3;
	uint32_t compute_restart_x;
	uint32_t compute_restart_y;
	uint32_t compute_restart_z;
	uint32_t compute_thread_trace_enable;
	uint32_t compute_misc_reserved;
	/* dwords 33-48: compute user data */
	uint32_t compute_user_data_0;
	uint32_t compute_user_data_1;
	uint32_t compute_user_data_2;
	uint32_t compute_user_data_3;
	uint32_t compute_user_data_4;
	uint32_t compute_user_data_5;
	uint32_t compute_user_data_6;
	uint32_t compute_user_data_7;
	uint32_t compute_user_data_8;
	uint32_t compute_user_data_9;
	uint32_t compute_user_data_10;
	uint32_t compute_user_data_11;
	uint32_t compute_user_data_12;
	uint32_t compute_user_data_13;
	uint32_t compute_user_data_14;
	uint32_t compute_user_data_15;
	/* dwords 49-50 */
	uint32_t cp_compute_csinvoc_count_lo;
	uint32_t cp_compute_csinvoc_count_hi;
	/* dwords 51-95: MQD/HQD queue state */
	uint32_t cp_mqd_base_addr_lo;
	uint32_t cp_mqd_base_addr_hi;
	uint32_t cp_hqd_active;
	uint32_t cp_hqd_vmid;
	uint32_t cp_hqd_persistent_state;
	uint32_t cp_hqd_pipe_priority;
	uint32_t cp_hqd_queue_priority;
	uint32_t cp_hqd_quantum;
	uint32_t cp_hqd_pq_base_lo;
	uint32_t cp_hqd_pq_base_hi;
	uint32_t cp_hqd_pq_rptr;
	uint32_t cp_hqd_pq_rptr_report_addr_lo;
	uint32_t cp_hqd_pq_rptr_report_addr_hi;
	uint32_t cp_hqd_pq_wptr_poll_addr_lo;
	uint32_t cp_hqd_pq_wptr_poll_addr_hi;
	uint32_t cp_hqd_pq_doorbell_control;
	uint32_t cp_hqd_pq_wptr;
	uint32_t cp_hqd_pq_control;
	uint32_t cp_hqd_ib_base_addr_lo;
	uint32_t cp_hqd_ib_base_addr_hi;
	uint32_t cp_hqd_ib_rptr;
	uint32_t cp_hqd_ib_control;
	uint32_t cp_hqd_iq_timer;
	uint32_t cp_hqd_iq_rptr;
	uint32_t cp_hqd_dequeue_request;
	uint32_t cp_hqd_dma_offload;
	uint32_t cp_hqd_sema_cmd;
	uint32_t cp_hqd_msg_type;
	uint32_t cp_hqd_atomic0_preop_lo;
	uint32_t cp_hqd_atomic0_preop_hi;
	uint32_t cp_hqd_atomic1_preop_lo;
	uint32_t cp_hqd_atomic1_preop_hi;
	uint32_t cp_hqd_hq_status0;
	uint32_t cp_hqd_hq_control0;
	uint32_t cp_mqd_control;
	uint32_t cp_mqd_query_time_lo;
	uint32_t cp_mqd_query_time_hi;
	uint32_t cp_mqd_connect_start_time_lo;
	uint32_t cp_mqd_connect_start_time_hi;
	uint32_t cp_mqd_connect_end_time_lo;
	uint32_t cp_mqd_connect_end_time_hi;
	uint32_t cp_mqd_connect_end_wf_count;
	uint32_t cp_mqd_connect_end_pq_rptr;
	uint32_t cp_mqd_connect_end_pq_wptr;
	uint32_t cp_mqd_connect_end_ib_rptr;
	/* dwords 96-99: reserved */
	uint32_t reserved_96;
	uint32_t reserved_97;
	uint32_t reserved_98;
	uint32_t reserved_99;
	/* dwords 100-107: IQ timer packet */
	uint32_t iqtimer_pkt_header;
	uint32_t iqtimer_pkt_dw0;
	uint32_t iqtimer_pkt_dw1;
	uint32_t iqtimer_pkt_dw2;
	uint32_t iqtimer_pkt_dw3;
	uint32_t iqtimer_pkt_dw4;
	uint32_t iqtimer_pkt_dw5;
	uint32_t iqtimer_pkt_dw6;
	/* dwords 108-111: reserved */
	uint32_t reserved_108;
	uint32_t reserved_109;
	uint32_t reserved_110;
	uint32_t reserved_111;
	/* dwords 112-127: queue doorbell ids */
	uint32_t queue_doorbell_id0;
	uint32_t queue_doorbell_id1;
	uint32_t queue_doorbell_id2;
	uint32_t queue_doorbell_id3;
	uint32_t queue_doorbell_id4;
	uint32_t queue_doorbell_id5;
	uint32_t queue_doorbell_id6;
	uint32_t queue_doorbell_id7;
	uint32_t queue_doorbell_id8;
	uint32_t queue_doorbell_id9;
	uint32_t queue_doorbell_id10;
	uint32_t queue_doorbell_id11;
	uint32_t queue_doorbell_id12;
	uint32_t queue_doorbell_id13;
	uint32_t queue_doorbell_id14;
	uint32_t queue_doorbell_id15;
 };
 #endif
--- a/drivers/gpu/drm/radeon/cikd.h
+++ b/drivers/gpu/drm/radeon/cikd.h
@ -30,6 +30,8 @@
 #define CIK_RB_BITMAP_WIDTH_PER_SH     2
 #define HAWAII_RB_BITMAP_WIDTH_PER_SH  4
 #define RADEON_NUM_OF_VMIDS	8
 /* DIDT IND registers */
 #define DIDT_SQ_CTRL0                                     0x0
 #       define DIDT_CTRL_EN                               (1 << 0)
@ -1137,6 +1139,9 @@
 #define			SH_MEM_ALIGNMENT_MODE_UNALIGNED			3
 #define		DEFAULT_MTYPE(x)				((x) << 4)
 #define		APE1_MTYPE(x)					((x) << 7)
 /* valid for both DEFAULT_MTYPE and APE1_MTYPE */
 #define	MTYPE_CACHED					0
 #define	MTYPE_NONCACHED					3
 #define	SX_DEBUG_1					0x9060
@ -1447,6 +1452,16 @@
 #define CP_HQD_ACTIVE                                     0xC91C
 #define CP_HQD_VMID                                       0xC920
 #define CP_HQD_PERSISTENT_STATE				0xC924u
 #define	DEFAULT_CP_HQD_PERSISTENT_STATE			(0x33U << 8)
 #define CP_HQD_PIPE_PRIORITY				0xC928u
 #define CP_HQD_QUEUE_PRIORITY				0xC92Cu
 #define CP_HQD_QUANTUM					0xC930u
 #define	QUANTUM_EN					1U
 #define	QUANTUM_SCALE_1MS				(1U << 4)
 #define	QUANTUM_DURATION(x)				((x) << 8)
 #define CP_HQD_PQ_BASE                                    0xC934
 #define CP_HQD_PQ_BASE_HI                                 0xC938
 #define CP_HQD_PQ_RPTR                                    0xC93C
@ -1474,12 +1489,32 @@
 #define		PRIV_STATE      			(1 << 30)
 #define		KMD_QUEUE      				(1 << 31)
-#define CP_HQD_DEQUEUE_REQUEST                          0xC974
+#define CP_HQD_IB_BASE_ADDR				0xC95Cu
 #define CP_HQD_IB_BASE_ADDR_HI			0xC960u
 #define CP_HQD_IB_RPTR					0xC964u
 #define CP_HQD_IB_CONTROL				0xC968u
 #define	IB_ATC_EN					(1U << 23)
 #define	DEFAULT_MIN_IB_AVAIL_SIZE			(3U << 20)
 #define CP_HQD_DEQUEUE_REQUEST			0xC974
 #define	DEQUEUE_REQUEST_DRAIN				1
 #define DEQUEUE_REQUEST_RESET				2
 #define CP_MQD_CONTROL                                  0xC99C
 #define		MQD_VMID(x)				((x) << 0)
 #define		MQD_VMID_MASK      			(0xf << 0)
 #define CP_HQD_SEMA_CMD					0xC97Cu
 #define CP_HQD_MSG_TYPE					0xC980u
 #define CP_HQD_ATOMIC0_PREOP_LO			0xC984u
 #define CP_HQD_ATOMIC0_PREOP_HI			0xC988u
 #define CP_HQD_ATOMIC1_PREOP_LO			0xC98Cu
 #define CP_HQD_ATOMIC1_PREOP_HI			0xC990u
 #define CP_HQD_HQ_SCHEDULER0			0xC994u
 #define CP_HQD_HQ_SCHEDULER1			0xC998u
 #define SH_STATIC_MEM_CONFIG			0x9604u
 #define DB_RENDER_CONTROL                               0x28000
 #define PA_SC_RASTER_CONFIG                             0x28350
@ -2069,4 +2104,20 @@
 #define VCE_CMD_IB_AUTO		0x00000005
 #define VCE_CMD_SEMAPHORE	0x00000006
 #define ATC_VMID0_PASID_MAPPING					0x339Cu
 #define	ATC_VMID_PASID_MAPPING_UPDATE_STATUS	0x3398u
 #define	ATC_VMID_PASID_MAPPING_VALID				(1U << 31)
 #define ATC_VM_APERTURE0_CNTL					0x3310u
 #define	ATS_ACCESS_MODE_NEVER						0
 #define	ATS_ACCESS_MODE_ALWAYS						1
 #define ATC_VM_APERTURE0_CNTL2					0x3318u
 #define ATC_VM_APERTURE0_HIGH_ADDR				0x3308u
 #define ATC_VM_APERTURE0_LOW_ADDR				0x3300u
 #define ATC_VM_APERTURE1_CNTL					0x3314u
 #define ATC_VM_APERTURE1_CNTL2					0x331Cu
 #define ATC_VM_APERTURE1_HIGH_ADDR				0x330Cu
 #define ATC_VM_APERTURE1_LOW_ADDR				0x3304u
 #endif
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@ -701,6 +701,10 @@ struct radeon_doorbell {
 int radeon_doorbell_get(struct radeon_device *rdev, u32 *page);
 void radeon_doorbell_free(struct radeon_device *rdev, u32 doorbell);
 void radeon_doorbell_get_kfd_info(struct radeon_device *rdev,
 				  phys_addr_t *aperture_base,
 				  size_t *aperture_size,
 				  size_t *start_offset);
 /*
 * IRQS.
@ -2393,6 +2397,8 @@ struct radeon_device {
 	struct radeon_atcs		atcs;
 	/* srbm instance registers */
 	struct mutex			srbm_mutex;
 	/* GRBM index mutex. Protects concurrent access to the GRBM index */
 	struct mutex			grbm_idx_mutex;
 	/* clock, powergating flags */
 	u32 cg_flags;
 	u32 pg_flags;
@ -2405,6 +2411,10 @@ struct radeon_device {
 	u64 vram_pin_size;
 	u64 gart_pin_size;
 	/* amdkfd interface */
 	struct kfd_dev		*kfd;
 	struct radeon_sa_manager	kfd_bo;
 	struct mutex	mn_lock;
 	DECLARE_HASHTABLE(mn_hash, 7);
 };
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@ -377,6 +377,37 @@ void radeon_doorbell_free(struct radeon_device *rdev, u32 doorbell)
 		__clear_bit(doorbell, rdev->doorbell.used);
 }
 /**
 * radeon_doorbell_get_kfd_info - Report doorbell configuration required to
 *                                setup KFD
 *
 * @rdev: radeon_device pointer
 * @aperture_base: output returning doorbell aperture base physical address
 * @aperture_size: output returning doorbell aperture size in bytes
 * @start_offset: output returning # of doorbell bytes reserved for radeon.
 *
 * Radeon and the KFD share the doorbell aperture. Radeon sets it up,
 * takes doorbells required for its own rings and reports the setup to KFD.
 * Radeon reserved doorbells are at the start of the doorbell aperture.
 */
 void radeon_doorbell_get_kfd_info(struct radeon_device *rdev,
				  phys_addr_t *aperture_base,
				  size_t *aperture_size,
				  size_t *start_offset)
 {
	/* Bytes at the start of the aperture that radeon keeps for itself. */
	size_t radeon_bytes = rdev->doorbell.num_doorbells * sizeof(u32);
	/* Nothing is left for KFD unless the aperture is strictly larger. */
	if (!(rdev->doorbell.size > radeon_bytes)) {
		*aperture_base = 0;
		*aperture_size = 0;
		*start_offset = 0;
		return;
	}
	/* KFD gets the whole aperture description plus the offset at which
	 * its own doorbells begin. */
	*aperture_base = rdev->doorbell.base;
	*aperture_size = rdev->doorbell.size;
	*start_offset = radeon_bytes;
 }
 /*
 * radeon_wb_*()
 * Writeback is the method by which the GPU updates special pages
@ -1272,6 +1303,7 @@ int radeon_device_init(struct radeon_device *rdev,
 	mutex_init(&rdev->pm.mutex);
 	mutex_init(&rdev->gpu_clock_mutex);
 	mutex_init(&rdev->srbm_mutex);
 	mutex_init(&rdev->grbm_idx_mutex);
 	init_rwsem(&rdev->pm.mclk_lock);
 	init_rwsem(&rdev->exclusive_lock);
 	init_waitqueue_head(&rdev->irq.vblank_queue);
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@ -41,6 +41,8 @@
 #include <drm/drm_gem.h>
 #include "drm_crtc_helper.h"
 #include "radeon_kfd.h"
 /*
 * KMS wrapper.
 * - 2.0.0 - initial interface
@ -654,12 +656,15 @@ static int __init radeon_init(void)
 #endif
 	}
 	radeon_kfd_init();
 	/* let modprobe override vga console setting */
 	return drm_pci_init(driver, pdriver);
 }
 static void __exit radeon_exit(void)
 {
 	radeon_kfd_fini();
 	drm_pci_exit(driver, pdriver);
 	radeon_unregister_atpx_handler();
 }
--- a/drivers/gpu/drm/radeon/radeon_kfd.c
+++ b/drivers/gpu/drm/radeon/radeon_kfd.c
@ -0,0 +1,563 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include <linux/module.h>
 #include <linux/fdtable.h>
 #include <linux/uaccess.h>
 #include <drm/drmP.h>
 #include "radeon.h"
 #include "cikd.h"
 #include "cik_reg.h"
 #include "radeon_kfd.h"
 #define CIK_PIPE_PER_MEC	(4)
 /*
  * Handle for one KFD allocation made by allocate_mem() in this file.
  * It is kmalloc'ed there and released by free_mem().
  */
 struct kgd_mem {
	/* Sub-allocation taken from rdev->kfd_bo. */
	struct radeon_sa_bo *sa_bo;
	/* GPU address of the sub-allocation (radeon_sa_bo_gpu_addr). */
	uint64_t gpu_addr;
	/* CPU address of the sub-allocation (radeon_sa_bo_cpu_addr). */
	void *ptr;
 };
 static int init_sa_manager(struct kgd_dev *kgd, unsigned int size);
 static void fini_sa_manager(struct kgd_dev *kgd);
 static int allocate_mem(struct kgd_dev *kgd, size_t size, size_t alignment,
 		enum kgd_memory_pool pool, struct kgd_mem **mem);
 static void free_mem(struct kgd_dev *kgd, struct kgd_mem *mem);
 static uint64_t get_vmem_size(struct kgd_dev *kgd);
 static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
 static uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
 /*
 * Register access functions
 */
 static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
 		uint32_t sh_mem_config,	uint32_t sh_mem_ape1_base,
 		uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
 					unsigned int vmid);
 static int kgd_init_memory(struct kgd_dev *kgd);
 static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
 				uint32_t hpd_size, uint64_t hpd_gpu_addr);
 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr);
 static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address,
 				uint32_t pipe_id, uint32_t queue_id);
 static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
 				unsigned int timeout, uint32_t pipe_id,
 				uint32_t queue_id);
 /*
  * Services radeon offers to amdkfd.  A pointer to this table is handed to
  * kgd2kfd_init() from radeon_kfd_init().
  */
 static const struct kfd2kgd_calls kfd2kgd = {
	.init_sa_manager = init_sa_manager,
	.fini_sa_manager = fini_sa_manager,
	.allocate_mem = allocate_mem,
	.free_mem = free_mem,
	.get_vmem_size = get_vmem_size,
	.get_gpu_clock_counter = get_gpu_clock_counter,
	.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
	.program_sh_mem_settings = kgd_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
	.init_memory = kgd_init_memory,
	.init_pipeline = kgd_init_pipeline,
	.hqd_load = kgd_hqd_load,
	.hqd_is_occupies = kgd_hqd_is_occupies,
	.hqd_destroy = kgd_hqd_destroy,
 };
 /*
  * Services amdkfd offers to radeon; filled in by kgd2kfd_init().  It stays
  * NULL when amdkfd is unavailable, and every wrapper in this file that uses
  * it first checks either this pointer or rdev->kfd.
  */
 static const struct kgd2kfd_calls *kgd2kfd;
 bool radeon_kfd_init(void)
 {
 	bool (*kgd2kfd_init_p)(unsigned, const struct kfd2kgd_calls*,
 				const struct kgd2kfd_calls**);
 	kgd2kfd_init_p = symbol_request(kgd2kfd_init);
 	if (kgd2kfd_init_p == NULL)
 		return false;
 	if (!kgd2kfd_init_p(KFD_INTERFACE_VERSION, &kfd2kgd, &kgd2kfd)) {
 		symbol_put(kgd2kfd_init);
 		kgd2kfd = NULL;
 		return false;
 	}
 	return true;
 }
 /*
  * Undo radeon_kfd_init(): let amdkfd clean up and drop the reference taken
  * on kgd2kfd_init.  Does nothing when the interface was never established.
  */
 void radeon_kfd_fini(void)
 {
	if (!kgd2kfd)
		return;
	kgd2kfd->exit();
	symbol_put(kgd2kfd_init);
 }
 /*
  * Ask amdkfd to probe this device and remember the resulting kfd handle in
  * rdev->kfd.  rdev->kfd is left untouched when amdkfd is not available.
  */
 void radeon_kfd_device_probe(struct radeon_device *rdev)
 {
	if (!kgd2kfd)
		return;
	rdev->kfd = kgd2kfd->probe((struct kgd_dev *)rdev, rdev->pdev);
 }
 /*
  * Describe the resources radeon shares with amdkfd (VMIDs, compute pipes,
  * doorbell aperture) and pass them to amdkfd's device_init.  Does nothing
  * when the device was not claimed by amdkfd.
  */
 void radeon_kfd_device_init(struct radeon_device *rdev)
 {
	struct kgd2kfd_shared_resources gpu_resources = {
		/* bits 8-15 set */
		.compute_vmid_bitmap = 0xFF00,
		.first_compute_pipe = 1,
		.compute_pipe_count = 8 - 1,
	};
	if (!rdev->kfd)
		return;
	radeon_doorbell_get_kfd_info(rdev,
			&gpu_resources.doorbell_physical_address,
			&gpu_resources.doorbell_aperture_size,
			&gpu_resources.doorbell_start_offset);
	kgd2kfd->device_init(rdev->kfd, &gpu_resources);
 }
 /*
  * Tear down the amdkfd side of this device, if any, and forget the handle
  * so that later wrappers become no-ops.
  */
 void radeon_kfd_device_fini(struct radeon_device *rdev)
 {
	if (!rdev->kfd)
		return;
	kgd2kfd->device_exit(rdev->kfd);
	rdev->kfd = NULL;
 }
 /*
  * Forward one interrupt-handler ring entry to amdkfd.  No-op when the
  * device has no kfd handle.
  */
 void radeon_kfd_interrupt(struct radeon_device *rdev, const void *ih_ring_entry)
 {
	if (!rdev->kfd)
		return;
	kgd2kfd->interrupt(rdev->kfd, ih_ring_entry);
 }
 /* Notify amdkfd that the device is suspending; no-op without a kfd handle. */
 void radeon_kfd_suspend(struct radeon_device *rdev)
 {
	if (!rdev->kfd)
		return;
	kgd2kfd->suspend(rdev->kfd);
 }
 /*
  * Notify amdkfd that the device is resuming.
  * Returns amdkfd's resume result, or 0 when the device has no kfd handle.
  */
 int radeon_kfd_resume(struct radeon_device *rdev)
 {
	if (!rdev->kfd)
		return 0;
	return kgd2kfd->resume(rdev->kfd);
 }
 /*
  * Translate a KFD memory pool into a radeon GEM domain: the framebuffer
  * pool maps to VRAM, every other pool maps to GTT.
  */
 static u32 pool_to_domain(enum kgd_memory_pool p)
 {
	if (p == KGD_POOL_FRAMEBUFFER)
		return RADEON_GEM_DOMAIN_VRAM;
	return RADEON_GEM_DOMAIN_GTT;
 }
 /*
  * Create and start the sub-allocator (rdev->kfd_bo) that backs KFD
  * allocations.  It is placed in GTT with the write-combine flag and
  * aligned to the GPU page size.
  * Returns 0 on success or the error reported by the sub-allocator; if
  * starting fails the manager is torn down again before returning.
  */
 static int init_sa_manager(struct kgd_dev *kgd, unsigned int size)
 {
	struct radeon_device *rdev = (struct radeon_device *)kgd;
	int ret;
	BUG_ON(kgd == NULL);
	ret = radeon_sa_bo_manager_init(rdev, &rdev->kfd_bo, size,
					RADEON_GPU_PAGE_SIZE,
					RADEON_GEM_DOMAIN_GTT,
					RADEON_GEM_GTT_WC);
	if (ret != 0)
		return ret;
	ret = radeon_sa_bo_manager_start(rdev, &rdev->kfd_bo);
	if (ret == 0)
		return 0;
	/* Start failed: do not leave a half-initialised manager behind. */
	radeon_sa_bo_manager_fini(rdev, &rdev->kfd_bo);
	return ret;
 }
 /* Suspend and then destroy the KFD sub-allocator (rdev->kfd_bo). */
 static void fini_sa_manager(struct kgd_dev *kgd)
 {
	struct radeon_device *rdev = (struct radeon_device *)kgd;
	struct radeon_sa_manager *kfd_pool;
	BUG_ON(kgd == NULL);
	kfd_pool = &rdev->kfd_bo;
	radeon_sa_bo_manager_suspend(rdev, kfd_pool);
	radeon_sa_bo_manager_fini(rdev, kfd_pool);
 }
 /*
  * Allocate @size bytes with the given @alignment for KFD from the
  * rdev->kfd_bo sub-allocator.
  *
  * @pool must map to the GTT domain; any other pool is rejected.
  * On success *mem points to a freshly allocated handle whose ptr/gpu_addr
  * describe the allocation; release it with free_mem().
  *
  * Returns 0 on success, -EINVAL for a non-GTT pool (*mem untouched),
  * -ENOMEM when the handle cannot be allocated, or the error returned by
  * radeon_sa_bo_new().  On the last two errors *mem is set to NULL.
  */
 static int allocate_mem(struct kgd_dev *kgd, size_t size, size_t alignment,
		enum kgd_memory_pool pool, struct kgd_mem **mem)
 {
	struct radeon_device *rdev = (struct radeon_device *)kgd;
	u32 domain;
	int r;
	BUG_ON(kgd == NULL);
	domain = pool_to_domain(pool);
	if (domain != RADEON_GEM_DOMAIN_GTT) {
		dev_err(rdev->dev,
			"Only allowed to allocate gart memory for kfd\n");
		return -EINVAL;
	}
	*mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
	if ((*mem) == NULL)
		return -ENOMEM;
	r = radeon_sa_bo_new(rdev, &rdev->kfd_bo, &(*mem)->sa_bo, size,
				alignment);
	if (r) {
		dev_err(rdev->dev, "failed to get memory for kfd (%d)\n", r);
		/* Do not leak the handle, and do not leave the caller with
		 * a pointer to a half-initialised one. */
		kfree(*mem);
		*mem = NULL;
		return r;
	}
	(*mem)->ptr = radeon_sa_bo_cpu_addr((*mem)->sa_bo);
	(*mem)->gpu_addr = radeon_sa_bo_gpu_addr((*mem)->sa_bo);
	return 0;
 }
 /*
  * Release an allocation made by allocate_mem(): return the sub-allocation
  * to the pool (no fence is passed) and free the handle itself.
  * @mem must be a valid handle; it is dereferenced unconditionally.
  */
 static void free_mem(struct kgd_dev *kgd, struct kgd_mem *mem)
 {
	struct radeon_device *rdev = (struct radeon_device *)kgd;
	struct radeon_sa_bo **sub_alloc;
	BUG_ON(kgd == NULL);
	sub_alloc = &mem->sa_bo;
	radeon_sa_bo_free(rdev, sub_alloc, NULL);
	kfree(mem);
 }
 /* Report the device's real VRAM size as recorded in rdev->mc. */
 static uint64_t get_vmem_size(struct kgd_dev *kgd)
 {
	struct radeon_device *radeon_dev = (struct radeon_device *)kgd;
	uint64_t vram_size;
	BUG_ON(kgd == NULL);
	vram_size = radeon_dev->mc.real_vram_size;
	return vram_size;
 }
 /* Read the GPU clock counter through the ASIC-specific callback. */
 static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd)
 {
	struct radeon_device *radeon_dev = (struct radeon_device *)kgd;
	uint64_t counter = radeon_dev->asic->get_gpu_clock_counter(radeon_dev);
	return counter;
 }
 /*
  * Report the maximum engine clock on AC power in MHz.
  * The stored sclk value is expressed in units of 10 kHz, hence the
  * division by 100.
  */
 static uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd)
 {
	struct radeon_device *radeon_dev = (struct radeon_device *)kgd;
	return radeon_dev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100;
 }
 /* A kgd_dev handle is the radeon_device pointer in disguise; cast it back. */
 static inline struct radeon_device *get_radeon_device(struct kgd_dev *kgd)
 {
	struct radeon_device *radeon_dev = (struct radeon_device *)kgd;
	return radeon_dev;
 }
 /* Write @value to the MMIO register at byte @offset from rdev->rmmio. */
 static void write_register(struct kgd_dev *kgd, uint32_t offset, uint32_t value)
 {
	struct radeon_device *radeon_dev = get_radeon_device(kgd);
	void __iomem *reg = (void __iomem *)(radeon_dev->rmmio + offset);
	writel(value, reg);
 }
 /* Read the MMIO register at byte @offset from rdev->rmmio. */
 static uint32_t read_register(struct kgd_dev *kgd, uint32_t offset)
 {
	struct radeon_device *radeon_dev = get_radeon_device(kgd);
	void __iomem *reg = (void __iomem *)(radeon_dev->rmmio + offset);
	return readl(reg);
 }
 /*
  * Take rdev->srbm_mutex and program SRBM_GFX_CNTL with the given
  * MEC/pipe/queue/VMID selection.  The mutex stays held on return; every
  * call must be paired with unlock_srbm().
  */
 static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
			uint32_t queue, uint32_t vmid)
 {
	struct radeon_device *radeon_dev = get_radeon_device(kgd);
	uint32_t selection;
	selection = MEID(mec) | PIPEID(pipe) | QUEUEID(queue) | VMID(vmid);
	mutex_lock(&radeon_dev->srbm_mutex);
	write_register(kgd, SRBM_GFX_CNTL, selection);
 }
 /*
  * Counterpart of lock_srbm(): write 0 to SRBM_GFX_CNTL, then release
  * rdev->srbm_mutex.
  */
 static void unlock_srbm(struct kgd_dev *kgd)
 {
	struct radeon_device *radeon_dev = get_radeon_device(kgd);
	write_register(kgd, SRBM_GFX_CNTL, 0);
	mutex_unlock(&radeon_dev->srbm_mutex);
 }
 /*
  * Select the hardware queue identified by KFD's @pipe_id/@queue_id and take
  * the SRBM lock; pair with release_queue().
  *
  * KFD pipe ids are shifted up by one before being split into a MEC number
  * (1-based) and a pipe within that MEC.  NOTE(review): presumably because
  * radeon keeps the first pipe for itself (first_compute_pipe = 1 in
  * radeon_kfd_device_init) - confirm.
  */
 static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
 {
	uint32_t hw_pipe = pipe_id + 1;
	uint32_t mec = (hw_pipe / CIK_PIPE_PER_MEC) + 1;
	uint32_t pipe_in_mec = hw_pipe % CIK_PIPE_PER_MEC;
	lock_srbm(kgd, mec, pipe_in_mec, queue_id, 0);
 }
 /* Counterpart of acquire_queue(): clear the selection and drop the lock. */
 static void release_queue(struct kgd_dev *kgd)
 {
	unlock_srbm(kgd);
 }
 /*
  * kgd_program_sh_mem_settings - write the SH_MEM_* registers for one VMID
  * @kgd: opaque device handle
  * @vmid: VMID selected through lock_srbm() while the registers are written
  * @sh_mem_config: value for SH_MEM_CONFIG
  * @sh_mem_ape1_base: value for SH_MEM_APE1_BASE
  * @sh_mem_ape1_limit: value for SH_MEM_APE1_LIMIT
  * @sh_mem_bases: value for SH_MEM_BASES
  *
  * All four writes happen between lock_srbm() and unlock_srbm(), i.e. with
  * srbm_mutex held and SRBM_GFX_CNTL programmed with @vmid (mec, pipe and
  * queue are all 0).
  */
 static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
 					uint32_t sh_mem_config,
 					uint32_t sh_mem_ape1_base,
 					uint32_t sh_mem_ape1_limit,
 					uint32_t sh_mem_bases)
 {
 	lock_srbm(kgd, 0, 0, 0, vmid);
 	write_register(kgd, SH_MEM_CONFIG, sh_mem_config);
 	write_register(kgd, SH_MEM_APE1_BASE, sh_mem_ape1_base);
 	write_register(kgd, SH_MEM_APE1_LIMIT, sh_mem_ape1_limit);
 	write_register(kgd, SH_MEM_BASES, sh_mem_bases);
 	unlock_srbm(kgd);
 }
 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
 					unsigned int vmid)
 {
 	/*
 	 * We have to assume that there is no outstanding mapping.
 	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0
 	 * because a mapping is in progress or because a mapping finished and
 	 * the SW cleared it.
 	 * So the protocol is to always wait & clear.
 	 */
 	uint32_t pasid_mapping = (pasid == 0) ? 0 :
 				(uint32_t)pasid | ATC_VMID_PASID_MAPPING_VALID;
 	write_register(kgd, ATC_VMID0_PASID_MAPPING + vmid*sizeof(uint32_t),
 			pasid_mapping);
 	while (!(read_register(kgd, ATC_VMID_PASID_MAPPING_UPDATE_STATUS) &
 								(1U << vmid)))
 		cpu_relax();
 	write_register(kgd, ATC_VMID_PASID_MAPPING_UPDATE_STATUS, 1U << vmid);
 	return 0;
 }
 /*
  * kgd_init_memory - program the SH_MEM_* defaults for VMIDs 8 through 15
  * @kgd: opaque device handle
  *
  * The same register values are written for every VMID in that range, each
  * time under lock_srbm()/unlock_srbm().
  *
  * Always returns 0.
  */
 static int kgd_init_memory(struct kgd_dev *kgd)
 {
 	/*
 	 * Configure apertures:
 	 * LDS:         0x60000000'00000000 - 0x60000001'00000000 (4GB)
 	 * Scratch:     0x60000001'00000000 - 0x60000002'00000000 (4GB)
 	 * GPUVM:       0x60010000'00000000 - 0x60020000'00000000 (1TB)
 	 */
 	const uint32_t bases = PRIVATE_BASE(0x6000) | SHARED_BASE(0x6000);
 	const uint32_t config =
 		ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) |
 		DEFAULT_MTYPE(MTYPE_NONCACHED);
 	int vmid;
 	for (vmid = 8; vmid < 16; vmid++) {
 		lock_srbm(kgd, 0, 0, 0, vmid);
 		write_register(kgd, SH_MEM_CONFIG, config);
 		write_register(kgd, SH_MEM_BASES, bases);
 		/* Scratch aperture is not supported for now. */
 		write_register(kgd, SH_STATIC_MEM_CONFIG, 0);
 		/* APE1 disabled for now. */
 		write_register(kgd, SH_MEM_APE1_BASE, 1);
 		write_register(kgd, SH_MEM_APE1_LIMIT, 0);
 		unlock_srbm(kgd);
 	}
 	return 0;
 }
 /*
  * kgd_init_pipeline - program the HPD EOP registers of one pipe
  * @kgd: opaque device handle
  * @pipe_id: pipe index; shifted up by one and split into MEC/pipe exactly
  *           as acquire_queue() does
  * @hpd_size: value written to CP_HPD_EOP_CONTROL
  * @hpd_gpu_addr: GPU address; it is written shifted right by 8 bits
  *
  * Always returns 0.
  */
 static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
 				uint32_t hpd_size, uint64_t hpd_gpu_addr)
 {
 	const uint64_t eop_base = hpd_gpu_addr >> 8;
 	uint32_t shifted_pipe = pipe_id + 1;
 	uint32_t mec = shifted_pipe / CIK_PIPE_PER_MEC + 1;
 	uint32_t pipe = shifted_pipe % CIK_PIPE_PER_MEC;
 	lock_srbm(kgd, mec, pipe, 0, 0);
 	write_register(kgd, CP_HPD_EOP_BASE_ADDR, lower_32_bits(eop_base));
 	write_register(kgd, CP_HPD_EOP_BASE_ADDR_HI, upper_32_bits(eop_base));
 	write_register(kgd, CP_HPD_EOP_VMID, 0);
 	write_register(kgd, CP_HPD_EOP_CONTROL, hpd_size);
 	unlock_srbm(kgd);
 	return 0;
 }
 /*
  * get_mqd - view an untyped MQD pointer as a struct cik_mqd
  * @mqd: pointer to convert; no checking is performed
  */
 static inline struct cik_mqd *get_mqd(void *mqd)
 {
 	struct cik_mqd *typed = mqd;
 	return typed;
 }
 /*
  * kgd_hqd_load - copy an MQD image into the registers of one hardware queue
  * @kgd: opaque device handle
  * @mqd: MQD image, interpreted as a struct cik_mqd
  * @pipe_id: pipe index, translated by acquire_queue()
  * @queue_id: queue index within the pipe
  * @wptr: user-space address the initial write pointer is read from
  *
  * The user write pointer is fetched with get_user() before the queue is
  * acquired. If that read fails, CP_HQD_PQ_WPTR is simply left unwritten.
  * All register writes happen between acquire_queue() and release_queue();
  * CP_HQD_ACTIVE is the last register written.
  *
  * Always returns 0.
  */
 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr)
 {
 	uint32_t wptr_shadow, is_wptr_shadow_valid;
 	struct cik_mqd *m;
 	m = get_mqd(mqd);
 	/* get_user() returns 0 on success, hence the negation. */
 	is_wptr_shadow_valid = !get_user(wptr_shadow, wptr);
 	acquire_queue(kgd, pipe_id, queue_id);
 	/* MQD location and control */
 	write_register(kgd, CP_MQD_BASE_ADDR, m->cp_mqd_base_addr_lo);
 	write_register(kgd, CP_MQD_BASE_ADDR_HI, m->cp_mqd_base_addr_hi);
 	write_register(kgd, CP_MQD_CONTROL, m->cp_mqd_control);
 	/* PQ base and control */
 	write_register(kgd, CP_HQD_PQ_BASE, m->cp_hqd_pq_base_lo);
 	write_register(kgd, CP_HQD_PQ_BASE_HI, m->cp_hqd_pq_base_hi);
 	write_register(kgd, CP_HQD_PQ_CONTROL, m->cp_hqd_pq_control);
 	/* IB state */
 	write_register(kgd, CP_HQD_IB_CONTROL, m->cp_hqd_ib_control);
 	write_register(kgd, CP_HQD_IB_BASE_ADDR, m->cp_hqd_ib_base_addr_lo);
 	write_register(kgd, CP_HQD_IB_BASE_ADDR_HI, m->cp_hqd_ib_base_addr_hi);
 	write_register(kgd, CP_HQD_IB_RPTR, m->cp_hqd_ib_rptr);
 	write_register(kgd, CP_HQD_PERSISTENT_STATE,
 			m->cp_hqd_persistent_state);
 	write_register(kgd, CP_HQD_SEMA_CMD, m->cp_hqd_sema_cmd);
 	write_register(kgd, CP_HQD_MSG_TYPE, m->cp_hqd_msg_type);
 	write_register(kgd, CP_HQD_ATOMIC0_PREOP_LO,
 			m->cp_hqd_atomic0_preop_lo);
 	write_register(kgd, CP_HQD_ATOMIC0_PREOP_HI,
 			m->cp_hqd_atomic0_preop_hi);
 	write_register(kgd, CP_HQD_ATOMIC1_PREOP_LO,
 			m->cp_hqd_atomic1_preop_lo);
 	write_register(kgd, CP_HQD_ATOMIC1_PREOP_HI,
 			m->cp_hqd_atomic1_preop_hi);
 	/* PQ read pointer, its report address and the wptr poll address */
 	write_register(kgd, CP_HQD_PQ_RPTR_REPORT_ADDR,
 			m->cp_hqd_pq_rptr_report_addr_lo);
 	write_register(kgd, CP_HQD_PQ_RPTR_REPORT_ADDR_HI,
 			m->cp_hqd_pq_rptr_report_addr_hi);
 	write_register(kgd, CP_HQD_PQ_RPTR, m->cp_hqd_pq_rptr);
 	write_register(kgd, CP_HQD_PQ_WPTR_POLL_ADDR,
 			m->cp_hqd_pq_wptr_poll_addr_lo);
 	write_register(kgd, CP_HQD_PQ_WPTR_POLL_ADDR_HI,
 			m->cp_hqd_pq_wptr_poll_addr_hi);
 	write_register(kgd, CP_HQD_PQ_DOORBELL_CONTROL,
 			m->cp_hqd_pq_doorbell_control);
 	write_register(kgd, CP_HQD_VMID, m->cp_hqd_vmid);
 	write_register(kgd, CP_HQD_QUANTUM, m->cp_hqd_quantum);
 	write_register(kgd, CP_HQD_PIPE_PRIORITY, m->cp_hqd_pipe_priority);
 	write_register(kgd, CP_HQD_QUEUE_PRIORITY, m->cp_hqd_queue_priority);
 	write_register(kgd, CP_HQD_IQ_RPTR, m->cp_hqd_iq_rptr);
 	/* Only seed the write pointer when the user-space read succeeded. */
 	if (is_wptr_shadow_valid)
 		write_register(kgd, CP_HQD_PQ_WPTR, wptr_shadow);
 	write_register(kgd, CP_HQD_ACTIVE, m->cp_hqd_active);
 	release_queue(kgd);
 	return 0;
 }
 /*
  * kgd_hqd_is_occupies - check whether a hardware queue slot holds a queue
  * @kgd: opaque device handle
  * @queue_address: queue base address; compared after a right shift by 8
  * @pipe_id: pipe index, translated by acquire_queue()
  * @queue_id: queue index within the pipe
  *
  * Returns true only when CP_HQD_ACTIVE reads non-zero and both
  * CP_HQD_PQ_BASE and CP_HQD_PQ_BASE_HI match @queue_address >> 8.
  */
 static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address,
 				uint32_t pipe_id, uint32_t queue_id)
 {
 	const uint64_t base = queue_address >> 8;
 	bool occupied = false;
 	acquire_queue(kgd, pipe_id, queue_id);
 	if (read_register(kgd, CP_HQD_ACTIVE) != 0) {
 		/* The high half is only read when the low half matches. */
 		if (read_register(kgd, CP_HQD_PQ_BASE) == lower_32_bits(base))
 			occupied = read_register(kgd, CP_HQD_PQ_BASE_HI) ==
 					upper_32_bits(base);
 	}
 	release_queue(kgd);
 	return occupied;
 }
 /*
  * kgd_hqd_destroy - request a dequeue of one hardware queue and wait for it
  * @kgd: opaque device handle
  * @reset_type: value written to CP_HQD_DEQUEUE_REQUEST
  * @timeout: maximum time to wait, in milliseconds
  * @pipe_id: pipe index, translated by acquire_queue()
  * @queue_id: queue index within the pipe
  *
  * Writes 0 to the queue's doorbell control register, issues the dequeue
  * request and then polls CP_HQD_ACTIVE every 20ms until bit 0 clears or
  * @timeout has been used up.
  *
  * The queue selection taken by acquire_queue() (and with it srbm_mutex) is
  * always dropped before returning, including on the timeout path.
  *
  * Returns 0 once the queue is no longer active, -ETIME on timeout.
  */
 static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
 				unsigned int timeout, uint32_t pipe_id,
 				uint32_t queue_id)
 {
 	const unsigned int poll_interval_ms = 20;
 	unsigned int remaining = timeout;
 	int ret = 0;
 	acquire_queue(kgd, pipe_id, queue_id);
 	write_register(kgd, CP_HQD_PQ_DOORBELL_CONTROL, 0);
 	write_register(kgd, CP_HQD_DEQUEUE_REQUEST, reset_type);
 	/* The dequeue is complete once the ACTIVE bit has dropped. */
 	while (read_register(kgd, CP_HQD_ACTIVE) & 0x1) {
 		if (remaining == 0) {
 			pr_err("kfd: cp queue preemption time out (%ums)\n",
 				timeout);
 			ret = -ETIME;
 			break;
 		}
 		msleep(poll_interval_ms);
 		/* Clamp at zero so the unsigned budget cannot wrap around. */
 		if (remaining > poll_interval_ms)
 			remaining -= poll_interval_ms;
 		else
 			remaining = 0;
 	}
 	release_queue(kgd);
 	return ret;
 }
--- a/drivers/gpu/drm/radeon/radeon_kfd.h
+++ b/drivers/gpu/drm/radeon/radeon_kfd.h
@ -0,0 +1,47 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 /*
 * radeon_kfd.h defines the private interface between the
 * AMD kernel graphics drivers and the AMD KFD.
 */
 #ifndef RADEON_KFD_H_INCLUDED
 #define RADEON_KFD_H_INCLUDED
 #include <linux/types.h>
 #include "../amd/include/kgd_kfd_interface.h"
 struct radeon_device;
 bool radeon_kfd_init(void);
 void radeon_kfd_fini(void);
 void radeon_kfd_suspend(struct radeon_device *rdev);
 int radeon_kfd_resume(struct radeon_device *rdev);
 void radeon_kfd_interrupt(struct radeon_device *rdev,
 			const void *ih_ring_entry);
 void radeon_kfd_device_probe(struct radeon_device *rdev);
 void radeon_kfd_device_init(struct radeon_device *rdev);
 void radeon_kfd_device_fini(struct radeon_device *rdev);
 #endif /* RADEON_KFD_H_INCLUDED */
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@ -34,6 +34,8 @@
 #include <linux/slab.h>
 #include <linux/pm_runtime.h>
 #include "radeon_kfd.h"
 #if defined(CONFIG_VGA_SWITCHEROO)
 bool radeon_has_atpx(void);
 #else
@ -63,6 +65,8 @@ int radeon_driver_unload_kms(struct drm_device *dev)
 	pm_runtime_get_sync(dev->dev);
 	radeon_kfd_device_fini(rdev);
 	radeon_acpi_fini(rdev);
 	radeon_modeset_fini(rdev);
@ -142,6 +146,9 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags)
 				"Error during ACPI methods call\n");
 	}
 	radeon_kfd_device_probe(rdev);
 	radeon_kfd_device_init(rdev);
 	if (radeon_is_px(dev)) {
 		pm_runtime_use_autosuspend(dev->dev);
 		pm_runtime_set_autosuspend_delay(dev->dev, 5000);
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@ -92,13 +92,6 @@ static spinlock_t state_lock;
 static struct workqueue_struct *iommu_wq;
 /*
 * Empty page table - Used between
 * mmu_notifier_invalidate_range_start and
 * mmu_notifier_invalidate_range_end
 */
 static u64 *empty_page_table;
 static void free_pasid_states(struct device_state *dev_state);
 static u16 device_id(struct pci_dev *pdev)
@ -279,10 +272,8 @@ static void free_pasid_state(struct pasid_state *pasid_state)
 static void put_pasid_state(struct pasid_state *pasid_state)
 {
-	if (atomic_dec_and_test(&pasid_state->count)) {
+	if (atomic_dec_and_test(&pasid_state->count))
 		put_device_state(pasid_state->device_state);
 		wake_up(&pasid_state->wq);
 	}
 }
 static void put_pasid_state_wait(struct pasid_state *pasid_state)
@ -291,9 +282,7 @@ static void put_pasid_state_wait(struct pasid_state *pasid_state)
 	prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE);
-	if (atomic_dec_and_test(&pasid_state->count))
+	if (!atomic_dec_and_test(&pasid_state->count))
 		put_device_state(pasid_state->device_state);
 	else
 		schedule();
 	finish_wait(&pasid_state->wq, &wait);
@ -418,46 +407,21 @@ static void mn_invalidate_page(struct mmu_notifier *mn,
 	__mn_flush_page(mn, address);
 }
-static void mn_invalidate_range_start(struct mmu_notifier *mn,
+static void mn_invalidate_range(struct mmu_notifier *mn,
-				      struct mm_struct *mm,
+				struct mm_struct *mm,
-				      unsigned long start, unsigned long end)
+				unsigned long start, unsigned long end)
 {
 	struct pasid_state *pasid_state;
 	struct device_state *dev_state;
 	unsigned long flags;
 	pasid_state = mn_to_state(mn);
 	dev_state   = pasid_state->device_state;
-	spin_lock_irqsave(&pasid_state->lock, flags);
+	if ((start ^ (end - 1)) < PAGE_SIZE)
-	if (pasid_state->mmu_notifier_count == 0) {
+		amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
-		amd_iommu_domain_set_gcr3(dev_state->domain,
+				     start);
-					  pasid_state->pasid,
+	else
-					  __pa(empty_page_table));
+		amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
 	}
 	pasid_state->mmu_notifier_count += 1;
 	spin_unlock_irqrestore(&pasid_state->lock, flags);
 }
 /*
  * mn_invalidate_range_end - invalidate_range_end() notifier callback
  * @mn: notifier; mapped to its pasid_state through mn_to_state()
  * @mm: not used by this function
  * @start: not used by this function
  * @end: not used by this function
  *
  * Decrements pasid_state->mmu_notifier_count under pasid_state->lock (with
  * interrupts saved/disabled). When the count reaches zero, the GCR3 entry
  * for the PASID is set to the physical address of pasid_state->mm->pgd.
  */
 static void mn_invalidate_range_end(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    unsigned long start, unsigned long end)
 {
 	struct pasid_state *pasid_state;
 	struct device_state *dev_state;
 	unsigned long flags;
 	pasid_state = mn_to_state(mn);
 	dev_state   = pasid_state->device_state;
 	spin_lock_irqsave(&pasid_state->lock, flags);
 	pasid_state->mmu_notifier_count -= 1;
 	/* Last outstanding range: point the PASID at the mm's page table. */
 	if (pasid_state->mmu_notifier_count == 0) {
 		amd_iommu_domain_set_gcr3(dev_state->domain,
 					  pasid_state->pasid,
 					  __pa(pasid_state->mm->pgd));
 	}
 	spin_unlock_irqrestore(&pasid_state->lock, flags);
 }
 static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
@ -482,8 +446,7 @@ static struct mmu_notifier_ops iommu_mn = {
 	.release		= mn_release,
 	.clear_flush_young      = mn_clear_flush_young,
 	.invalidate_page        = mn_invalidate_page,
-	.invalidate_range_start = mn_invalidate_range_start,
+	.invalidate_range       = mn_invalidate_range,
 	.invalidate_range_end   = mn_invalidate_range_end,
 };
 static void set_pri_tag_status(struct pasid_state *pasid_state,
@ -954,18 +917,10 @@ static int __init amd_iommu_v2_init(void)
 	if (iommu_wq == NULL)
 		goto out;
 	ret = -ENOMEM;
 	empty_page_table = (u64 *)get_zeroed_page(GFP_KERNEL);
 	if (empty_page_table == NULL)
 		goto out_destroy_wq;
 	amd_iommu_register_ppr_notifier(&ppr_nb);
 	return 0;
 out_destroy_wq:
 	destroy_workqueue(iommu_wq);
 out:
 	return ret;
 }
@ -999,8 +954,6 @@ static void __exit amd_iommu_v2_exit(void)
 	}
 	destroy_workqueue(iommu_wq);
 	free_page((unsigned long)empty_page_table);
 }
 module_init(amd_iommu_v2_init);
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@ -98,11 +98,11 @@ struct mmu_notifier_ops {
 	/*
 	 * invalidate_range_start() and invalidate_range_end() must be
 	 * paired and are called only when the mmap_sem and/or the
-	 * locks protecting the reverse maps are held. The subsystem
+	 * locks protecting the reverse maps are held. If the subsystem
-	 * must guarantee that no additional references are taken to
+	 * can't guarantee that no additional references are taken to
-	 * the pages in the range established between the call to
+	 * the pages in the range, it has to implement the
-	 * invalidate_range_start() and the matching call to
+	 * invalidate_range() notifier to remove any references taken
-	 * invalidate_range_end().
+	 * after invalidate_range_start().
 	 *
 	 * Invalidation of multiple concurrent ranges may be
 	 * optionally permitted by the driver. Either way the
@ -144,6 +144,29 @@ struct mmu_notifier_ops {
 	void (*invalidate_range_end)(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start, unsigned long end);
 	/*
 	 * invalidate_range() is either called between
 	 * invalidate_range_start() and invalidate_range_end() when the
 	 * VM has to free pages that were unmapped, but before the
 	 * pages are actually freed, or outside of _start()/_end() when
 	 * a (remote) TLB flush is necessary.
 	 *
 	 * If invalidate_range() is used to manage a non-CPU TLB with
 	 * shared page-tables, it is not necessary to implement the
 	 * invalidate_range_start()/end() notifiers, as
 	 * invalidate_range() already catches the points in time when an
 	 * external TLB range needs to be flushed.
 	 *
 	 * The invalidate_range() function is called under the ptl
 	 * spin-lock and is not allowed to sleep.
 	 *
 	 * Note that this function might be called with just a sub-range
 	 * of what was passed to invalidate_range_start()/end(), if
 	 * called between those functions.
 	 */
 	void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
 				 unsigned long start, unsigned long end);
 };
 /*
@ -190,6 +213,8 @@ extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@ -242,6 +267,13 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 		__mmu_notifier_invalidate_range_end(mm, start, end);
 }
 /*
  * Forward a range invalidation to __mmu_notifier_invalidate_range(), but
  * only when mm_has_notifiers() reports that @mm has notifiers.
  */
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
 	if (!mm_has_notifiers(mm))
 		return;
 	__mmu_notifier_invalidate_range(mm, start, end);
 }
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 	mm->mmu_notifier_mm = NULL;
@ -279,6 +311,44 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	__young;							\
 })
 /*
  * ptep_clear_flush_notify() - ptep_clear_flush() followed by a call to
  * mmu_notifier_invalidate_range() for the page containing @__address.
  *
  * Evaluates to the pte returned by ptep_clear_flush(). @__address is
  * parenthesized before masking so that any expression may be passed; note
  * that it is still expanded twice, so it must not have side effects.
  */
 #define	ptep_clear_flush_notify(__vma, __address, __ptep)		\
 ({									\
 	unsigned long ___addr = (__address) & PAGE_MASK;		\
 	struct mm_struct *___mm = (__vma)->vm_mm;			\
 	pte_t ___pte;							\
 									\
 	___pte = ptep_clear_flush(__vma, __address, __ptep);		\
 	mmu_notifier_invalidate_range(___mm, ___addr,			\
 					___addr + PAGE_SIZE);		\
 									\
 	___pte;								\
 })
 /*
  * pmdp_clear_flush_notify() - pmdp_clear_flush() followed by a call to
  * mmu_notifier_invalidate_range() for the HPAGE_PMD_SIZE-sized range
  * containing @__haddr.
  *
  * Evaluates to the pmd returned by pmdp_clear_flush(). @__haddr is
  * parenthesized before masking so that any expression may be passed; note
  * that it is still expanded twice, so it must not have side effects.
  */
 #define pmdp_clear_flush_notify(__vma, __haddr, __pmd)			\
 ({									\
 	unsigned long ___haddr = (__haddr) & HPAGE_PMD_MASK;		\
 	struct mm_struct *___mm = (__vma)->vm_mm;			\
 	pmd_t ___pmd;							\
 									\
 	___pmd = pmdp_clear_flush(__vma, __haddr, __pmd);		\
 	mmu_notifier_invalidate_range(___mm, ___haddr,			\
 				      ___haddr + HPAGE_PMD_SIZE);	\
 									\
 	___pmd;								\
 })
 /*
  * pmdp_get_and_clear_notify() - pmdp_get_and_clear() followed by a call to
  * mmu_notifier_invalidate_range() for the HPAGE_PMD_SIZE-sized range
  * containing @__haddr.
  *
  * Evaluates to the pmd returned by pmdp_get_and_clear(). @__haddr is
  * parenthesized before masking so that any expression may be passed; note
  * that @__mm and @__haddr are each expanded twice, so they must not have
  * side effects.
  */
 #define pmdp_get_and_clear_notify(__mm, __haddr, __pmd)			\
 ({									\
 	unsigned long ___haddr = (__haddr) & HPAGE_PMD_MASK;		\
 	pmd_t ___pmd;							\
 									\
 	___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd);		\
 	mmu_notifier_invalidate_range(__mm, ___haddr,			\
 				      ___haddr + HPAGE_PMD_SIZE);	\
 									\
 	___pmd;								\
 })
 /*
 * set_pte_at_notify() sets the pte _after_ running the notifier.
 * This is safe to start by updating the secondary MMUs, because the primary MMU
@ -342,6 +412,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 {
 }
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
 }
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }
@ -352,6 +427,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define	ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_clear_flush_notify pmdp_clear_flush
 #define pmdp_get_and_clear_notify pmdp_get_and_clear
 #define set_pte_at_notify set_pte_at
 #endif /* CONFIG_MMU_NOTIFIER */
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@ -0,0 +1,154 @@
 /*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef KFD_IOCTL_H_INCLUDED
 #define KFD_IOCTL_H_INCLUDED
 #include <linux/types.h>
 #include <linux/ioctl.h>
 #define KFD_IOCTL_MAJOR_VERSION 1
 #define KFD_IOCTL_MINOR_VERSION 0
 /* Argument block of KFD_IOC_GET_VERSION; both fields are filled in by KFD. */
 struct kfd_ioctl_get_version_args {
 	uint32_t major_version;	/* from KFD */
 	uint32_t minor_version;	/* from KFD */
 };
 /* For kfd_ioctl_create_queue_args.queue_type. */
 #define KFD_IOC_QUEUE_TYPE_COMPUTE	0
 #define KFD_IOC_QUEUE_TYPE_SDMA		1
 #define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL	2
 #define KFD_MAX_QUEUE_PERCENTAGE	100
 #define KFD_MAX_QUEUE_PRIORITY		15
 /*
  * Argument block of KFD_IOC_CREATE_QUEUE (a read/write ioctl).
  * Each field is annotated with its direction: "to KFD" fields are inputs
  * supplied by the caller, "from KFD" fields are outputs.
  */
 struct kfd_ioctl_create_queue_args {
 	uint64_t ring_base_address;	/* to KFD */
 	uint64_t write_pointer_address;	/* from KFD */
 	uint64_t read_pointer_address;	/* from KFD */
 	uint64_t doorbell_offset;	/* from KFD */
 	uint32_t ring_size;		/* to KFD */
 	uint32_t gpu_id;		/* to KFD */
 	/* one of the KFD_IOC_QUEUE_TYPE_* values */
 	uint32_t queue_type;		/* to KFD */
 	/* presumably bounded by KFD_MAX_QUEUE_PERCENTAGE - TODO confirm */
 	uint32_t queue_percentage;	/* to KFD */
 	/* presumably bounded by KFD_MAX_QUEUE_PRIORITY - TODO confirm */
 	uint32_t queue_priority;	/* to KFD */
 	uint32_t queue_id;		/* from KFD */
 	uint64_t eop_buffer_address;	/* to KFD */
 	uint64_t eop_buffer_size;	/* to KFD */
 	uint64_t ctx_save_restore_address; /* to KFD */
 	uint64_t ctx_save_restore_size;	/* to KFD */
 };
 /* Argument block of KFD_IOC_DESTROY_QUEUE. */
 struct kfd_ioctl_destroy_queue_args {
 	uint32_t queue_id;		/* to KFD */
 	uint32_t pad;			/* explicit padding word */
 };
 /* Argument block of KFD_IOC_UPDATE_QUEUE; every field is an input. */
 struct kfd_ioctl_update_queue_args {
 	uint64_t ring_base_address;	/* to KFD */
 	uint32_t queue_id;		/* to KFD */
 	uint32_t ring_size;		/* to KFD */
 	uint32_t queue_percentage;	/* to KFD */
 	uint32_t queue_priority;	/* to KFD */
 };
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
 /*
  * Argument block of KFD_IOC_SET_MEMORY_POLICY; every field is an input.
  * default_policy and alternate_policy take KFD_IOC_CACHE_POLICY_* values.
  */
 struct kfd_ioctl_set_memory_policy_args {
 	uint64_t alternate_aperture_base;	/* to KFD */
 	uint64_t alternate_aperture_size;	/* to KFD */
 	uint32_t gpu_id;			/* to KFD */
 	uint32_t default_policy;		/* to KFD */
 	uint32_t alternate_policy;		/* to KFD */
 	uint32_t pad;				/* explicit padding word */
 };
 /*
 * All counters are monotonic. They are used for profiling of compute jobs.
 * The profiling is done by userspace.
 *
 * In case of GPU reset, the counter should not be affected.
 */
 /*
  * Argument block of KFD_IOC_GET_CLOCK_COUNTERS: gpu_id is the only input,
  * the four counter/frequency fields are outputs.
  */
 struct kfd_ioctl_get_clock_counters_args {
 	uint64_t gpu_clock_counter;	/* from KFD */
 	uint64_t cpu_clock_counter;	/* from KFD */
 	uint64_t system_clock_counter;	/* from KFD */
 	uint64_t system_clock_freq;	/* from KFD */
 	uint32_t gpu_id;		/* to KFD */
 	uint32_t pad;			/* explicit padding word */
 };
 #define NUM_OF_SUPPORTED_GPUS 7
 /*
  * Aperture description for one GPU: base/limit pairs for the LDS, scratch
  * and GPUVM apertures plus the gpu_id they belong to. Used as the element
  * type of kfd_ioctl_get_process_apertures_args.process_apertures[].
  */
 struct kfd_process_device_apertures {
 	uint64_t lds_base;		/* from KFD */
 	uint64_t lds_limit;		/* from KFD */
 	uint64_t scratch_base;		/* from KFD */
 	uint64_t scratch_limit;		/* from KFD */
 	uint64_t gpuvm_base;		/* from KFD */
 	uint64_t gpuvm_limit;		/* from KFD */
 	uint32_t gpu_id;		/* from KFD */
 	uint32_t pad;			/* explicit padding word */
 };
 /*
  * Argument block of KFD_IOC_GET_PROCESS_APERTURES: a fixed-size array of
  * NUM_OF_SUPPORTED_GPUS aperture descriptions and the count KFD reports
  * in num_of_nodes.
  */
 struct kfd_ioctl_get_process_apertures_args {
 	struct kfd_process_device_apertures
 			process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */
 	/* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */
 	uint32_t num_of_nodes;
 	uint32_t pad;			/* explicit padding word */
 };
 #define KFD_IOC_MAGIC 'K'
 #define KFD_IOC_GET_VERSION \
 		_IOR(KFD_IOC_MAGIC, 1, struct kfd_ioctl_get_version_args)
 #define KFD_IOC_CREATE_QUEUE \
 		_IOWR(KFD_IOC_MAGIC, 2, struct kfd_ioctl_create_queue_args)
 #define KFD_IOC_DESTROY_QUEUE \
 	_IOWR(KFD_IOC_MAGIC, 3, struct kfd_ioctl_destroy_queue_args)
 #define KFD_IOC_SET_MEMORY_POLICY \
 	_IOW(KFD_IOC_MAGIC, 4, struct kfd_ioctl_set_memory_policy_args)
 #define KFD_IOC_GET_CLOCK_COUNTERS \
 	_IOWR(KFD_IOC_MAGIC, 5, struct kfd_ioctl_get_clock_counters_args)
 #define KFD_IOC_GET_PROCESS_APERTURES \
 	_IOR(KFD_IOC_MAGIC, 6, struct kfd_ioctl_get_process_apertures_args)
 #define KFD_IOC_UPDATE_QUEUE \
 	_IOW(KFD_IOC_MAGIC, 7, struct kfd_ioctl_update_queue_args)
 #endif
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	}
 	flush_cache_page(vma, addr, pte_pfn(*ptep));
-	ptep_clear_flush(vma, addr, ptep);
+	ptep_clear_flush_notify(vma, addr, ptep);
 	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 	page_remove_rmap(page);
--- a/mm/fremap.c
+++ b/mm/fremap.c
@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (pte_present(pte)) {
 		flush_cache_page(vma, addr, pte_pfn(pte));
-		pte = ptep_clear_flush(vma, addr, ptep);
+		pte = ptep_clear_flush_notify(vma, addr, ptep);
 		page = vm_normal_page(vma, addr, pte);
 		if (page) {
 			if (pte_dirty(pte))
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@ -1036,7 +1036,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		goto out_free_pages;
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	pmdp_clear_flush(vma, haddr, pmd);
+	pmdp_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@ -1179,7 +1179,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		pmd_t entry;
 		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_clear_flush(vma, haddr, pmd);
+		pmdp_clear_flush_notify(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		mem_cgroup_commit_charge(new_page, memcg, false);
 		lru_cache_add_active_or_unevictable(new_page, vma);
@ -1512,7 +1512,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		pmd_t entry;
 		ret = 1;
 		if (!prot_numa) {
-			entry = pmdp_get_and_clear(mm, addr, pmd);
+			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
 			if (pmd_numa(entry))
 				entry = pmd_mknonnuma(entry);
 			entry = pmd_modify(entry, newprot);
@ -1644,6 +1644,7 @@ static int __split_huge_page_splitting(struct page *page,
 		 * serialize against split_huge_page*.
 		 */
 		pmdp_splitting_flush(vma, address, pmd);
 		ret = 1;
 		spin_unlock(ptl);
 	}
@ -2834,7 +2835,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmd_t _pmd;
 	int i;
-	pmdp_clear_flush(vma, haddr, pmd);
+	pmdp_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@ -2598,8 +2598,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			}
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		} else {
-			if (cow)
+			if (cow) {
 				huge_ptep_set_wrprotect(src, addr, src_pte);
 				mmu_notifier_invalidate_range(src, mmun_start,
 								   mmun_end);
 			}
 			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@ -2899,6 +2902,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Break COW */
 		huge_ptep_clear_flush(vma, address, ptep);
 		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
 		page_remove_rmap(old_page);
@ -3374,6 +3378,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
 	mmu_notifier_invalidate_range(mm, start, end);
 	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 	mmu_notifier_invalidate_range_end(mm, start, end);
--- a/mm/ksm.c
+++ b/mm/ksm.c
@ -892,7 +892,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		 * this assure us that no O_DIRECT can happen after the check
 		 * or in the middle of the check.
 		 */
-		entry = ptep_clear_flush(vma, addr, ptep);
+		entry = ptep_clear_flush_notify(vma, addr, ptep);
 		/*
 		 * Check that no O_DIRECT or similar I/O is in progress on the
 		 * page
@ -960,7 +960,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	page_add_anon_rmap(kpage, vma, addr);
 	flush_cache_page(vma, addr, pte_pfn(*ptep));
-	ptep_clear_flush(vma, addr, ptep);
+	ptep_clear_flush_notify(vma, addr, ptep);
 	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 	page_remove_rmap(page);
--- a/mm/memory.c
+++ b/mm/memory.c
@ -238,6 +238,7 @@ static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
 	tlb->need_flush = 0;
 	tlb_flush(tlb);
 	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb_table_flush(tlb);
 #endif
@ -2234,7 +2235,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush(vma, address, page_table);
+		ptep_clear_flush_notify(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
 		mem_cgroup_commit_charge(new_page, memcg, false);
 		lru_cache_add_active_or_unevictable(new_page, vma);
--- a/mm/migrate.c
+++ b/mm/migrate.c
@ -1854,7 +1854,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	flush_cache_range(vma, mmun_start, mmun_end);
 	page_add_anon_rmap(new_page, vma, mmun_start);
-	pmdp_clear_flush(vma, mmun_start, pmd);
+	pmdp_clear_flush_notify(vma, mmun_start, pmd);
 	set_pmd_at(mm, mmun_start, pmd, entry);
 	flush_tlb_range(vma, mmun_start, mmun_end);
 	update_mmu_cache_pmd(vma, address, &entry);
@ -1862,6 +1862,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	if (page_count(page) != 2) {
 		set_pmd_at(mm, mmun_start, pmd, orig_entry);
 		flush_tlb_range(vma, mmun_start, mmun_end);
 		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
 		update_mmu_cache_pmd(vma, address, &entry);
 		page_remove_rmap(new_page);
 		goto fail_putback;
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@ -193,6 +193,16 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
 		/*
 		 * Call invalidate_range here too to avoid the need for the
 		 * subsystem of having to register an invalidate_range_end
 		 * call-back when there is invalidate_range already. Usually a
 		 * subsystem registers either invalidate_range_start()/end() or
 		 * invalidate_range(), so this will be no additional overhead
 		 * (besides the pointer check).
 		 */
 		if (mn->ops->invalidate_range)
 			mn->ops->invalidate_range(mn, mm, start, end);
 		if (mn->ops->invalidate_range_end)
 			mn->ops->invalidate_range_end(mn, mm, start, end);
 	}
@ -200,6 +210,21 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
 /*
  * Call the invalidate_range() callback of every notifier on @mm's list
  * that provides one. The list walk runs inside an SRCU read-side section.
  */
 void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
 	struct mmu_notifier *notifier;
 	int srcu_idx = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(notifier, &mm->mmu_notifier_mm->list, hlist) {
 		/* Notifiers without this callback are simply skipped. */
 		if (!notifier->ops->invalidate_range)
 			continue;
 		notifier->ops->invalidate_range(notifier, mm, start, end);
 	}
 	srcu_read_unlock(&srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    int take_mmap_sem)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@ -1378,7 +1378,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush(vma, address, pte);
+		pteval = ptep_clear_flush_notify(vma, address, pte);
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address)) {