Merge tag 'misc-habanalabs-next-2020-09-22' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.10-rc1:

- Stop using the DRM's dma-fence module and instead use kernel completions.
- Support PCIe AER
- Use dma_mmap_coherent for memory allocated using dma_alloc_coherent
- Use smallest possible alignment when allocating virtual addresses in our
  MMU driver.
- Refactor MMU driver code to be device-oriented
- Allow the user to check CS status without sleeping
- Add an option to map a Command Buffer to the Device's MMU
- Expose sync manager resource allocation to user through INFO IOCTL
- Convert code to use the standard BIT(), GENMASK() and FIELD_PREP() macros
  (see the example after this list)
- Many small fixes (casting, better error messages, removal of unused
  defines, H/W configuration fixes, etc.)
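
A quick illustration of that macro conversion, with a made-up register field
(nothing below is taken from the habanalabs code):

/* Illustrative only -- QMAN_CFG and its fields are hypothetical. */
#include <linux/bits.h>
#include <linux/bitfield.h>

#define QMAN_CFG_EN		BIT(0)		/* was: (1 << 0) */
#define QMAN_CFG_PRIO_MASK	GENMASK(7, 4)	/* was: (0xf << 4) */

static u32 qman_cfg_build(bool en, u32 prio)
{
        u32 cfg = 0;

        if (en)
                cfg |= QMAN_CFG_EN;

        /* FIELD_PREP() shifts 'prio' into bits [7:4] named by the mask */
        cfg |= FIELD_PREP(QMAN_CFG_PRIO_MASK, prio);

        return cfg;
}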

* tag 'misc-habanalabs-next-2020-09-22' of git://people.freedesktop.org/~gabbayo/linux: (46 commits)
  habanalabs: update scratchpad register map
  habanalabs: add indication of security-enabled F/W
  habanalabs/gaudi: fix DMA completions max outstanding to 15
  habanalabs/gaudi: remove axi drain support
  habanalabs: update firmware interface file
  habanalabs: Add an option to map CB to device MMU
  habanalabs: Save context in a command buffer object
  habanalabs: no need for DMA_SHARED_BUFFER
  habanalabs: allow to wait on CS without sleep
  habanalabs/gaudi: increase timeout for boot fit load
  habanalabs: add debugfs support for MMU with 6 HOPs
  habanalabs: add num_hops to hl_mmu_properties
  habanalabs: refactor MMU as device-oriented
  habanalabs: rename mmu.c to mmu_v1.c
  habanalabs: use smallest possible alignment for virtual addresses
  habanalabs: check flag before reset because of f/w event
  habanalabs: increase PQ COMP_OFFSET by one nibble
  habanalabs: Fix alignment issue in cpucp_info structure
  habanalabs: remove unused define
  habanalabs: remove unused ASIC function pointer
  ...
Committed by Greg Kroah-Hartman on 2020-09-22 18:38:09 +02:00 (commit 9e07279310)
33 changed files with 8652 additions and 7694 deletions


@ -2,13 +2,17 @@ What: /sys/class/habanalabs/hl<n>/armcp_kernel_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Linux kernel running on the device's CPU
Description: Version of the Linux kernel running on the device's CPU.
Will be DEPRECATED in Linux kernel version 5.10, and be
replaced with cpucp_kernel_ver
What: /sys/class/habanalabs/hl<n>/armcp_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the application running on the device's CPU
Will be DEPRECATED in Linux kernel version 5.10, and be
replaced with cpucp_ver
What: /sys/class/habanalabs/hl<n>/clk_max_freq_mhz
Date: Jun 2019
@ -33,6 +37,18 @@ KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Device's CPLD F/W
What: /sys/class/habanalabs/hl<n>/cpucp_kernel_ver
Date: Oct 2020
KernelVersion: 5.10
Contact: oded.gabbay@gmail.com
Description: Version of the Linux kernel running on the device's CPU
What: /sys/class/habanalabs/hl<n>/cpucp_ver
Date: Oct 2020
KernelVersion: 5.10
Contact: oded.gabbay@gmail.com
Description: Version of the application running on the device's CPU
What: /sys/class/habanalabs/hl<n>/device_type
Date: Jan 2019
KernelVersion: 5.1


@ -7,7 +7,6 @@ config HABANA_AI
tristate "HabanaAI accelerators (habanalabs)"
depends on PCI && HAS_IOMEM
select FRAME_VECTOR
select DMA_SHARED_BUFFER
select GENERIC_ALLOCATOR
select HWMON
help


@ -3,5 +3,5 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
common/asid.o common/habanalabs_ioctl.o \
common/command_buffer.o common/hw_queue.o common/irq.o \
common/sysfs.o common/hwmon.o common/memory.o \
common/command_submission.o common/mmu.o common/firmware_if.o \
common/pci.o
common/command_submission.o common/mmu.o common/mmu_v1.o \
common/firmware_if.o common/pci.o


@ -13,6 +13,131 @@
#include <linux/uaccess.h>
#include <linux/genalloc.h>
static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_vm_va_block *va_block, *tmp;
dma_addr_t bus_addr;
u64 virt_addr;
u32 page_size = prop->pmmu.page_size;
s32 offset;
int rc;
if (!hdev->supports_cb_mapping) {
dev_err_ratelimited(hdev->dev,
"Cannot map CB because no VA range is allocated for CB mapping\n");
return -EINVAL;
}
if (!hdev->mmu_enable) {
dev_err_ratelimited(hdev->dev,
"Cannot map CB because MMU is disabled\n");
return -EINVAL;
}
INIT_LIST_HEAD(&cb->va_block_list);
for (bus_addr = cb->bus_address;
bus_addr < cb->bus_address + cb->size;
bus_addr += page_size) {
virt_addr = (u64) gen_pool_alloc(ctx->cb_va_pool, page_size);
if (!virt_addr) {
dev_err(hdev->dev,
"Failed to allocate device virtual address for CB\n");
rc = -ENOMEM;
goto err_va_pool_free;
}
va_block = kzalloc(sizeof(*va_block), GFP_KERNEL);
if (!va_block) {
rc = -ENOMEM;
gen_pool_free(ctx->cb_va_pool, virt_addr, page_size);
goto err_va_pool_free;
}
va_block->start = virt_addr;
va_block->end = virt_addr + page_size;
va_block->size = page_size;
list_add_tail(&va_block->node, &cb->va_block_list);
}
mutex_lock(&ctx->mmu_lock);
bus_addr = cb->bus_address;
offset = 0;
list_for_each_entry(va_block, &cb->va_block_list, node) {
rc = hl_mmu_map(ctx, va_block->start, bus_addr, va_block->size,
list_is_last(&va_block->node,
&cb->va_block_list));
if (rc) {
dev_err(hdev->dev, "Failed to map VA %#llx to CB\n",
va_block->start);
goto err_va_umap;
}
bus_addr += va_block->size;
offset += va_block->size;
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
cb->is_mmu_mapped = true;
return 0;
err_va_umap:
list_for_each_entry(va_block, &cb->va_block_list, node) {
if (offset <= 0)
break;
hl_mmu_unmap(ctx, va_block->start, va_block->size,
offset <= va_block->size);
offset -= va_block->size;
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
err_va_pool_free:
list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
list_del(&va_block->node);
kfree(va_block);
}
return rc;
}
static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb)
{
struct hl_device *hdev = ctx->hdev;
struct hl_vm_va_block *va_block, *tmp;
mutex_lock(&ctx->mmu_lock);
list_for_each_entry(va_block, &cb->va_block_list, node)
if (hl_mmu_unmap(ctx, va_block->start, va_block->size,
list_is_last(&va_block->node,
&cb->va_block_list)))
dev_warn_ratelimited(hdev->dev,
"Failed to unmap CB's va 0x%llx\n",
va_block->start);
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
list_del(&va_block->node);
kfree(va_block);
}
}
static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
{
if (cb->is_internal)
@ -47,6 +172,11 @@ static void cb_release(struct kref *ref)
hl_debugfs_remove_cb(cb);
if (cb->is_mmu_mapped)
cb_unmap_mem(cb->ctx, cb);
hl_ctx_put(cb->ctx);
cb_do_release(hdev, cb);
}
@ -107,11 +237,12 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
}
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
u32 cb_size, u64 *handle, int ctx_id, bool internal_cb)
struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
bool map_cb, u64 *handle)
{
struct hl_cb *cb;
bool alloc_new_cb = true;
int rc;
int rc, ctx_id = ctx->asid;
/*
* Can't use generic function to check this because of special case
@ -163,7 +294,21 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
}
cb->hdev = hdev;
cb->ctx_id = ctx_id;
cb->ctx = ctx;
hl_ctx_get(hdev, cb->ctx);
if (map_cb) {
if (ctx_id == HL_KERNEL_ASID_ID) {
dev_err(hdev->dev,
"CB mapping is not supported for kernel context\n");
rc = -EINVAL;
goto release_cb;
}
rc = cb_map_mem(ctx, cb);
if (rc)
goto release_cb;
}
spin_lock(&mgr->cb_lock);
rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC);
@ -171,10 +316,10 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
if (rc < 0) {
dev_err(hdev->dev, "Failed to allocate IDR for a new CB\n");
goto release_cb;
goto unmap_mem;
}
cb->id = rc;
cb->id = (u64) rc;
kref_init(&cb->refcount);
spin_lock_init(&cb->lock);
@ -183,14 +328,18 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
* idr is 32-bit so we can safely OR it with a mask that is above
* 32 bit
*/
*handle = cb->id | HL_MMAP_CB_MASK;
*handle = cb->id | HL_MMAP_TYPE_CB;
*handle <<= PAGE_SHIFT;
hl_debugfs_add_cb(cb);
return 0;
unmap_mem:
if (cb->is_mmu_mapped)
cb_unmap_mem(cb->ctx, cb);
release_cb:
hl_ctx_put(cb->ctx);
cb_do_release(hdev, cb);
out_err:
*handle = 0;
@ -250,9 +399,10 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
args->in.cb_size, HL_MAX_CB_SIZE);
rc = -EINVAL;
} else {
rc = hl_cb_create(hdev, &hpriv->cb_mgr,
args->in.cb_size, &handle,
hpriv->ctx->asid, false);
rc = hl_cb_create(hdev, &hpriv->cb_mgr, hpriv->ctx,
args->in.cb_size, false,
!!(args->in.flags & HL_CB_FLAGS_MAP),
&handle);
}
memset(args, 0, sizeof(*args));
@ -300,11 +450,14 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_cb *cb;
phys_addr_t address;
u32 handle, user_cb_size;
int rc;
/* We use the page offset to hold the idr and thus we need to clear
* it before doing the mmap itself
*/
handle = vma->vm_pgoff;
vma->vm_pgoff = 0;
/* reference was taken here */
cb = hl_cb_get(hdev, &hpriv->cb_mgr, handle);
@ -356,12 +509,8 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
vma->vm_private_data = cb;
/* Calculate address for CB */
address = virt_to_phys((void *) (uintptr_t) cb->kernel_address);
rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address,
address, cb->size);
rc = hdev->asic_funcs->cb_mmap(hdev, vma, (void *) cb->kernel_address,
cb->bus_address, cb->size);
if (rc) {
spin_lock(&cb->lock);
cb->mmap = false;
@ -425,7 +574,7 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
if (kref_put(&cb->refcount, cb_release) != 1)
dev_err(hdev->dev,
"CB %d for CTX ID %d is still alive\n",
id, cb->ctx_id);
id, cb->ctx->asid);
}
idr_destroy(&mgr->cb_handles);
@ -438,8 +587,8 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
struct hl_cb *cb;
int rc;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
HL_KERNEL_ASID_ID, internal_cb);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx, cb_size,
internal_cb, false, &cb_handle);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate CB for the kernel driver %d\n", rc);
@ -495,3 +644,45 @@ int hl_cb_pool_fini(struct hl_device *hdev)
return 0;
}
int hl_cb_va_pool_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
if (!hdev->supports_cb_mapping)
return 0;
ctx->cb_va_pool = gen_pool_create(__ffs(prop->pmmu.page_size), -1);
if (!ctx->cb_va_pool) {
dev_err(hdev->dev,
"Failed to create VA gen pool for CB mapping\n");
return -ENOMEM;
}
rc = gen_pool_add(ctx->cb_va_pool, prop->cb_va_start_addr,
prop->cb_va_end_addr - prop->cb_va_start_addr, -1);
if (rc) {
dev_err(hdev->dev,
"Failed to add memory to VA gen pool for CB mapping\n");
goto err_pool_destroy;
}
return 0;
err_pool_destroy:
gen_pool_destroy(ctx->cb_va_pool);
return rc;
}
void hl_cb_va_pool_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (!hdev->supports_cb_mapping)
return;
gen_pool_destroy(ctx->cb_va_pool);
}
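
The cb_mmap() callback in the hunk above now receives the CB's kernel virtual
address and DMA address instead of a physical address computed with
virt_to_phys(); that is what lets the ASIC code implement the "Use
dma_mmap_coherent" item from the tag. The ASIC implementations are not part of
this excerpt, so the following is only a minimal sketch of such a callback
under that assumption (the function name and the vm_flags choice are
illustrative):

/* Sketch only -- not the driver's actual ASIC cb_mmap callback. */
#include <linux/dma-mapping.h>
#include <linux/mm.h>

static int example_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
                           void *cpu_addr, dma_addr_t dma_addr, size_t size)
{
        int rc;

        vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;

        /* reuse the DMA API's mmap helper for dma_alloc_coherent() memory */
        rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size);
        if (rc)
                dev_err(hdev->dev, "dma_mmap_coherent failed, rc %d\n", rc);

        return rc;
}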


@ -38,26 +38,10 @@ void hl_sob_reset_error(struct kref *ref)
hw_sob->q_idx, hw_sob->sob_id);
}
static const char *hl_fence_get_driver_name(struct dma_fence *fence)
{
return "HabanaLabs";
}
static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
{
struct hl_cs_compl *hl_cs_compl =
container_of(fence, struct hl_cs_compl, base_fence);
return dev_name(hl_cs_compl->hdev->dev);
}
static bool hl_fence_enable_signaling(struct dma_fence *fence)
{
return true;
}
static void hl_fence_release(struct dma_fence *fence)
static void hl_fence_release(struct kref *kref)
{
struct hl_fence *fence =
container_of(kref, struct hl_fence, refcount);
struct hl_cs_compl *hl_cs_cmpl =
container_of(fence, struct hl_cs_compl, base_fence);
struct hl_device *hdev = hl_cs_cmpl->hdev;
@ -99,15 +83,27 @@ static void hl_fence_release(struct dma_fence *fence)
}
free:
kfree_rcu(hl_cs_cmpl, base_fence.rcu);
kfree(hl_cs_cmpl);
}
static const struct dma_fence_ops hl_fence_ops = {
.get_driver_name = hl_fence_get_driver_name,
.get_timeline_name = hl_fence_get_timeline_name,
.enable_signaling = hl_fence_enable_signaling,
.release = hl_fence_release
};
void hl_fence_put(struct hl_fence *fence)
{
if (fence)
kref_put(&fence->refcount, hl_fence_release);
}
void hl_fence_get(struct hl_fence *fence)
{
if (fence)
kref_get(&fence->refcount);
}
static void hl_fence_init(struct hl_fence *fence)
{
kref_init(&fence->refcount);
fence->error = 0;
init_completion(&fence->completion);
}
static void cs_get(struct hl_cs *cs)
{
@ -256,6 +252,8 @@ static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx)
ctx->cs_counters.parsing_drop_cnt;
hdev->aggregated_cs_counters.queue_full_drop_cnt +=
ctx->cs_counters.queue_full_drop_cnt;
hdev->aggregated_cs_counters.max_cs_in_flight_drop_cnt +=
ctx->cs_counters.max_cs_in_flight_drop_cnt;
}
static void cs_do_release(struct kref *ref)
@ -336,7 +334,7 @@ static void cs_do_release(struct kref *ref)
* In case the wait for signal CS was submitted, the put occurs
* in init_signal_wait_cs() right before hanging on the PQ.
*/
dma_fence_put(cs->signal_fence);
hl_fence_put(cs->signal_fence);
}
/*
@ -348,19 +346,18 @@ static void cs_do_release(struct kref *ref)
hl_ctx_put(cs->ctx);
/* We need to mark an error for not submitted because in that case
* the dma fence release flow is different. Mainly, we don't need
* the hl fence release flow is different. Mainly, we don't need
* to handle hw_sob for signal/wait
*/
if (cs->timedout)
dma_fence_set_error(cs->fence, -ETIMEDOUT);
cs->fence->error = -ETIMEDOUT;
else if (cs->aborted)
dma_fence_set_error(cs->fence, -EIO);
cs->fence->error = -EIO;
else if (!cs->submitted)
dma_fence_set_error(cs->fence, -EBUSY);
dma_fence_signal(cs->fence);
dma_fence_put(cs->fence);
cs->fence->error = -EBUSY;
complete_all(&cs->fence->completion);
hl_fence_put(cs->fence);
cs_counters_aggregate(hdev, cs->ctx);
kfree(cs->jobs_in_queue_cnt);
@ -401,7 +398,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
enum hl_cs_type cs_type, struct hl_cs **cs_new)
{
struct hl_cs_compl *cs_cmpl;
struct dma_fence *other = NULL;
struct hl_fence *other = NULL;
struct hl_cs *cs;
int rc;
@ -434,9 +431,11 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
cs_cmpl->cs_seq = ctx->cs_sequence;
other = ctx->cs_pending[cs_cmpl->cs_seq &
(hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
dev_dbg(hdev->dev,
if (other && !completion_done(&other->completion)) {
dev_dbg_ratelimited(hdev->dev,
"Rejecting CS because of too many in-flights CS\n");
ctx->cs_counters.max_cs_in_flight_drop_cnt++;
rc = -EAGAIN;
goto free_fence;
}
@ -448,8 +447,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
goto free_fence;
}
dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
ctx->asid, ctx->cs_sequence);
/* init hl_fence */
hl_fence_init(&cs_cmpl->base_fence);
cs->sequence = cs_cmpl->cs_seq;
@ -458,9 +457,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
&cs_cmpl->base_fence;
ctx->cs_sequence++;
dma_fence_get(&cs_cmpl->base_fence);
hl_fence_get(&cs_cmpl->base_fence);
dma_fence_put(other);
hl_fence_put(other);
spin_unlock(&ctx->cs_lock);
@ -690,8 +689,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
rc = -ENOMEM;
if (is_kernel_allocated_cb)
goto release_cb;
else
goto free_cs_object;
goto free_cs_object;
}
job->id = i + 1;
@ -773,7 +772,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
struct hl_ctx *ctx = hpriv->ctx;
struct hl_cs_chunk *cs_chunk_array, *chunk;
struct hw_queue_properties *hw_queue_prop;
struct dma_fence *sig_fence = NULL;
struct hl_fence *sig_fence = NULL;
struct hl_cs_job *job;
struct hl_cs *cs;
struct hl_cb *cb;
@ -883,14 +882,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
dev_err(hdev->dev,
"CS seq 0x%llx is not of a signal CS\n",
signal_seq);
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
rc = -EINVAL;
goto free_signal_seq_array;
}
if (dma_fence_is_signaled(sig_fence)) {
if (completion_done(&sig_fence->completion)) {
/* signal CS already finished */
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
rc = 0;
goto free_signal_seq_array;
}
@ -902,7 +901,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
rc = allocate_cs(hdev, ctx, cs_type, &cs);
if (rc) {
if (cs_type == CS_TYPE_WAIT)
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
hl_ctx_put(ctx);
goto free_signal_seq_array;
}
@ -1162,7 +1161,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
static long _hl_cs_wait_ioctl(struct hl_device *hdev,
struct hl_ctx *ctx, u64 timeout_us, u64 seq)
{
struct dma_fence *fence;
struct hl_fence *fence;
unsigned long timeout;
long rc;
@ -1181,12 +1180,18 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
"Can't wait on CS %llu because current CS is at seq %llu\n",
seq, ctx->cs_sequence);
} else if (fence) {
rc = dma_fence_wait_timeout(fence, true, timeout);
if (!timeout_us)
rc = completion_done(&fence->completion);
else
rc = wait_for_completion_interruptible_timeout(
&fence->completion, timeout);
if (fence->error == -ETIMEDOUT)
rc = -ETIMEDOUT;
else if (fence->error == -EIO)
rc = -EIO;
dma_fence_put(fence);
hl_fence_put(fence);
} else {
dev_dbg(hdev->dev,
"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",


@ -23,7 +23,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
*/
for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
dma_fence_put(ctx->cs_pending[i]);
hl_fence_put(ctx->cs_pending[i]);
kfree(ctx->cs_pending);
@ -37,6 +37,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
hl_device_set_debug_mode(hdev, false);
hl_cb_va_pool_fini(ctx);
hl_vm_ctx_fini(ctx);
hl_asid_free(hdev, ctx->asid);
} else {
@ -128,7 +129,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0;
ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
sizeof(struct dma_fence *),
sizeof(struct hl_fence *),
GFP_KERNEL);
if (!ctx->cs_pending)
return -ENOMEM;
@ -155,15 +156,24 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
goto err_asid_free;
}
rc = hl_cb_va_pool_init(ctx);
if (rc) {
dev_err(hdev->dev,
"Failed to init VA pool for mapped CB\n");
goto err_vm_ctx_fini;
}
rc = hdev->asic_funcs->ctx_init(ctx);
if (rc) {
dev_err(hdev->dev, "ctx_init failed\n");
goto err_vm_ctx_fini;
goto err_cb_va_pool_fini;
}
}
return 0;
err_cb_va_pool_fini:
hl_cb_va_pool_fini(ctx);
err_vm_ctx_fini:
hl_vm_ctx_fini(ctx);
err_asid_free:
@ -184,10 +194,10 @@ int hl_ctx_put(struct hl_ctx *ctx)
return kref_put(&ctx->refcount, hl_ctx_do_release);
}
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
struct dma_fence *fence;
struct hl_fence *fence;
spin_lock(&ctx->cs_lock);
@ -201,8 +211,9 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return NULL;
}
fence = dma_fence_get(
ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
hl_fence_get(fence);
spin_unlock(&ctx->cs_lock);
return fence;
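
hl_cb_va_pool_init()/hl_cb_va_pool_fini(), wired into context init/teardown in
the hunk above and implemented in the command_buffer.c hunk earlier, are thin
wrappers around the kernel's genalloc API. A self-contained sketch of the same
create/add/alloc/free round trip, with an arbitrary device VA window (the
addresses and sizes below are made up):

/* Sketch of the genalloc usage behind the per-context CB VA pool. */
#include <linux/genalloc.h>
#include <linux/sizes.h>

static int va_pool_demo(void)
{
        struct gen_pool *pool;
        unsigned long va;
        int rc;

        /* 4 KB granularity (order 12), no NUMA node preference */
        pool = gen_pool_create(12, -1);
        if (!pool)
                return -ENOMEM;

        /* hand the pool a device VA window to carve allocations from */
        rc = gen_pool_add(pool, 0x8000000000UL, SZ_32M, -1);
        if (rc)
                goto destroy;

        va = gen_pool_alloc(pool, SZ_4K);	/* one page of device VA */
        if (!va) {
                rc = -ENOMEM;
                goto destroy;
        }

        gen_pool_free(pool, va, SZ_4K);
destroy:
        gen_pool_destroy(pool);
        return rc;
}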


@ -21,7 +21,7 @@ static struct dentry *hl_debug_root;
static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
u8 i2c_reg, long *val)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
@ -29,8 +29,8 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_RD <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_RD <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.i2c_bus = i2c_bus;
pkt.i2c_addr = i2c_addr;
pkt.i2c_reg = i2c_reg;
@ -47,7 +47,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
u8 i2c_reg, u32 val)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
@ -55,8 +55,8 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_WR <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_WR <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.i2c_bus = i2c_bus;
pkt.i2c_addr = i2c_addr;
pkt.i2c_reg = i2c_reg;
@ -73,7 +73,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
@ -81,8 +81,8 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_LED_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_LED_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.led_index = cpu_to_le32(led);
pkt.value = cpu_to_le64(state);
@ -110,8 +110,8 @@ static int command_buffers_show(struct seq_file *s, void *data)
seq_puts(s, "---------------------------------------------------------------\n");
}
seq_printf(s,
" %03d %d 0x%08x %d %d %d\n",
cb->id, cb->ctx_id, cb->size,
" %03llu %d 0x%08x %d %d %d\n",
cb->id, cb->ctx->asid, cb->size,
kref_read(&cb->refcount),
cb->mmap, cb->cs_cnt);
}
@ -354,6 +354,14 @@ static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
mmu_specs->hop4_shift);
}
static inline u64 get_hop5_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop5_mask,
mmu_specs->hop5_shift);
}
static inline u64 get_next_hop_addr(u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
@ -377,6 +385,7 @@ static int mmu_show(struct seq_file *s, void *data)
hop2_addr = 0, hop2_pte_addr = 0, hop2_pte = 0,
hop3_addr = 0, hop3_pte_addr = 0, hop3_pte = 0,
hop4_addr = 0, hop4_pte_addr = 0, hop4_pte = 0,
hop5_addr = 0, hop5_pte_addr = 0, hop5_pte = 0,
virt_addr = dev_entry->mmu_addr;
if (!hdev->mmu_enable)
@ -428,20 +437,49 @@ static int mmu_show(struct seq_file *s, void *data)
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
if (!(hop3_pte & LAST_MASK)) {
if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
if (!(hop3_pte & LAST_MASK)) {
hop4_addr = get_next_hop_addr(hop3_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
hop4_addr, virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev,
hop4_pte_addr);
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop3_pte & PAGE_PRESENT_MASK))
goto not_mapped;
}
} else {
hop4_addr = get_next_hop_addr(hop3_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop3_pte & PAGE_PRESENT_MASK))
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
hop4_addr, virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev,
hop4_pte_addr);
if (!(hop4_pte & LAST_MASK)) {
hop5_addr = get_next_hop_addr(hop4_pte);
if (hop5_addr == ULLONG_MAX)
goto not_mapped;
hop5_pte_addr = get_hop5_pte_addr(ctx, mmu_prop,
hop5_addr, virt_addr);
hop5_pte = hdev->asic_funcs->read_pte(hdev,
hop5_pte_addr);
if (!(hop5_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
}
}
seq_printf(s, "asid: %u, virt_addr: 0x%llx\n",
@ -463,10 +501,22 @@ static int mmu_show(struct seq_file *s, void *data)
seq_printf(s, "hop3_pte_addr: 0x%llx\n", hop3_pte_addr);
seq_printf(s, "hop3_pte: 0x%llx\n", hop3_pte);
if (!(hop3_pte & LAST_MASK)) {
if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
if (!(hop3_pte & LAST_MASK)) {
seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
}
} else {
seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
if (!(hop4_pte & LAST_MASK)) {
seq_printf(s, "hop5_addr: 0x%llx\n", hop5_addr);
seq_printf(s, "hop5_pte_addr: 0x%llx\n", hop5_pte_addr);
seq_printf(s, "hop5_pte: 0x%llx\n", hop5_pte);
}
}
goto out;
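
The 5-hop/6-hop handling added to mmu_show() above is unrolled per hop, which
makes it long; conceptually it is one loop that descends until a PTE has the
LAST bit set (or the hop budget runs out), failing if any PTE along the way is
not present. The loop below is an illustrative restructuring only, with
stand-in masks and callbacks (the callbacks stand in for the driver's per-hop
PTE-address helpers and hdev->asic_funcs->read_pte()):

#include <linux/errno.h>
#include <linux/types.h>

#define EX_PAGE_PRESENT_MASK	0x1ULL
#define EX_LAST_MASK		0x800ULL
#define EX_HOP_PHYS_ADDR_MASK	(~0xFFFULL)

static int walk_hops(u64 hop0_addr, u64 vaddr, u32 num_hops,
                     u64 (*pte_addr)(u32 hop, u64 hop_addr, u64 vaddr),
                     u64 (*read_pte)(u64 pte_addr), u64 *leaf_pte)
{
        u64 hop_addr = hop0_addr, pte = 0;
        u32 hop;

        for (hop = 0 ; hop < num_hops ; hop++) {
                pte = read_pte(pte_addr(hop, hop_addr, vaddr));

                if (!(pte & EX_PAGE_PRESENT_MASK))
                        return -ENOENT;		/* not mapped */

                if (pte & EX_LAST_MASK)
                        break;			/* leaf PTE reached */

                /* intermediate PTE: holds the next hop table's address */
                hop_addr = pte & EX_HOP_PHYS_ADDR_MASK;
        }

        *leaf_pte = pte;
        return 0;
}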


@ -123,9 +123,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct hl_fpriv *hpriv = filp->private_data;
unsigned long vm_pgoff;
if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
vma->vm_pgoff ^= HL_MMAP_CB_MASK;
vm_pgoff = vma->vm_pgoff;
vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
case HL_MMAP_TYPE_CB:
return hl_cb_mmap(hpriv, vma);
}
@ -286,7 +290,7 @@ static int device_early_init(struct hl_device *hdev)
}
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
snprintf(workq_name, 32, "hl-free-jobs-%u", i);
snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
if (hdev->cq_wq[i] == NULL) {
dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
@ -317,6 +321,10 @@ static int device_early_init(struct hl_device *hdev)
goto free_chip_info;
}
rc = hl_mmu_if_set_funcs(hdev);
if (rc)
goto free_idle_busy_ts_arr;
hl_cb_mgr_init(&hdev->kernel_cb_mgr);
mutex_init(&hdev->send_cpu_message_lock);
@ -330,6 +338,8 @@ static int device_early_init(struct hl_device *hdev)
return 0;
free_idle_busy_ts_arr:
kfree(hdev->idle_busy_ts_arr);
free_chip_info:
kfree(hdev->hl_chip_info);
free_eq_wq:
@ -871,7 +881,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
* so this message won't be sent
*/
if (hl_fw_send_pci_access_msg(hdev,
ARMCP_PACKET_DISABLE_PCI_ACCESS))
CPUCP_PACKET_DISABLE_PCI_ACCESS))
dev_warn(hdev->dev,
"Failed to disable PCI access by F/W\n");
}
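
The new hl_mmap() dispatch above relies on the mmap-offset encoding added to
habanalabs.h later in this diff: the mmap type sits in bits [63:62] of the
byte offset and the handle in the lower bits, so vma->vm_pgoff carries both
after the kernel's PAGE_SHIFT division. Putting the encode side
(hl_cb_create) and decode side (hl_mmap) next to each other, as a condensed
restatement with illustrative helper names:

/* Condensed from the HL_MMAP_* defines and the code in this diff. */
#include <linux/mm.h>		/* PAGE_SHIFT */
#include <linux/types.h>

#define HL_MMAP_TYPE_SHIFT		(62 - PAGE_SHIFT)
#define HL_MMAP_TYPE_MASK		(0x3ull << HL_MMAP_TYPE_SHIFT)
#define HL_MMAP_TYPE_CB			(0x2ull << HL_MMAP_TYPE_SHIFT)
#define HL_MMAP_OFFSET_VALUE_MASK	(0x3FFFFFFFFFFFull >> PAGE_SHIFT)
#define HL_MMAP_OFFSET_VALUE_GET(off)	((off) & HL_MMAP_OFFSET_VALUE_MASK)

/* hl_cb_create(): the handle user space later passes as the mmap offset */
static u64 cb_handle_encode(u64 cb_id)
{
        return (cb_id | HL_MMAP_TYPE_CB) << PAGE_SHIFT;
}

/* hl_mmap(): vma->vm_pgoff already arrives divided by PAGE_SIZE */
static u64 cb_handle_decode(unsigned long vm_pgoff, u64 *type)
{
        *type = vm_pgoff & HL_MMAP_TYPE_MASK;	/* e.g. HL_MMAP_TYPE_CB */
        return HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
}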


@ -68,9 +68,9 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
{
struct armcp_packet pkt = {};
struct cpucp_packet pkt = {};
pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);
return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
sizeof(pkt), 0, NULL);
@ -79,7 +79,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
u16 len, u32 timeout, long *result)
{
struct armcp_packet *pkt;
struct cpucp_packet *pkt;
dma_addr_t pkt_dma_addr;
u32 tmp;
int rc = 0;
@ -111,7 +111,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
}
rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
(tmp == ARMCP_PACKET_FENCE_VAL), 1000,
(tmp == CPUCP_PACKET_FENCE_VAL), 1000,
timeout, true);
hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
@ -124,12 +124,12 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
tmp = le32_to_cpu(pkt->ctl);
rc = (tmp & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
if (rc) {
dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
rc,
(tmp & ARMCP_PKT_CTL_OPCODE_MASK)
>> ARMCP_PKT_CTL_OPCODE_SHIFT);
(tmp & CPUCP_PKT_CTL_OPCODE_MASK)
>> CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = -EIO;
} else if (result) {
*result = (long) le64_to_cpu(pkt->result);
@ -145,14 +145,14 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(event_type);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -167,15 +167,15 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
size_t irq_arr_size)
{
struct armcp_unmask_irq_arr_packet *pkt;
struct cpucp_unmask_irq_arr_packet *pkt;
size_t total_pkt_size;
long result;
int rc;
total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) +
irq_arr_size;
/* data should be aligned to 8 bytes in order to ArmCP to copy it */
/* data should be aligned to 8 bytes in order to CPU-CP to copy it */
total_pkt_size = (total_pkt_size + 0x7) & ~0x7;
/* total_pkt_size is casted to u16 later on */
@ -191,8 +191,8 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0]));
memcpy(&pkt->irqs, irq_arr, irq_arr_size);
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
total_pkt_size, 0, &result);
@ -207,19 +207,19 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
int hl_fw_test_cpu_queue(struct hl_device *hdev)
{
struct armcp_packet test_pkt = {};
struct cpucp_packet test_pkt = {};
long result;
int rc;
test_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
test_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
sizeof(test_pkt), 0, &result);
if (!rc) {
if (result != ARMCP_PACKET_FENCE_VAL)
if (result != CPUCP_PACKET_FENCE_VAL)
dev_err(hdev->dev,
"CPU queue test failed (0x%08lX)\n", result);
} else {
@ -251,61 +251,61 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
int hl_fw_send_heartbeat(struct hl_device *hdev)
{
struct armcp_packet hb_pkt = {};
struct cpucp_packet hb_pkt = {};
long result;
int rc;
hb_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
sizeof(hb_pkt), 0, &result);
if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
rc = -EIO;
return rc;
}
int hl_fw_armcp_info_get(struct hl_device *hdev)
int hl_fw_cpucp_info_get(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct armcp_packet pkt = {};
void *armcp_info_cpu_addr;
dma_addr_t armcp_info_dma_addr;
struct cpucp_packet pkt = {};
void *cpucp_info_cpu_addr;
dma_addr_t cpucp_info_dma_addr;
long result;
int rc;
armcp_info_cpu_addr =
cpucp_info_cpu_addr =
hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
sizeof(struct armcp_info),
&armcp_info_dma_addr);
if (!armcp_info_cpu_addr) {
sizeof(struct cpucp_info),
&cpucp_info_dma_addr);
if (!cpucp_info_cpu_addr) {
dev_err(hdev->dev,
"Failed to allocate DMA memory for ArmCP info packet\n");
"Failed to allocate DMA memory for CPU-CP info packet\n");
return -ENOMEM;
}
memset(armcp_info_cpu_addr, 0, sizeof(struct armcp_info));
memset(cpucp_info_cpu_addr, 0, sizeof(struct cpucp_info));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(armcp_info_dma_addr);
pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info));
pkt.ctl = cpu_to_le32(CPUCP_PACKET_INFO_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(cpucp_info_dma_addr);
pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_info));
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_ARMCP_INFO_TIMEOUT_USEC, &result);
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle ArmCP info pkt, error %d\n", rc);
"Failed to handle CPU-CP info pkt, error %d\n", rc);
goto out;
}
memcpy(&prop->armcp_info, armcp_info_cpu_addr,
sizeof(prop->armcp_info));
memcpy(&prop->cpucp_info, cpucp_info_cpu_addr,
sizeof(prop->cpucp_info));
rc = hl_build_hwmon_channel_info(hdev, prop->armcp_info.sensors);
rc = hl_build_hwmon_channel_info(hdev, prop->cpucp_info.sensors);
if (rc) {
dev_err(hdev->dev,
"Failed to build hwmon channel info, error %d\n", rc);
@ -315,14 +315,14 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
out:
hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
sizeof(struct armcp_info), armcp_info_cpu_addr);
sizeof(struct cpucp_info), cpucp_info_cpu_addr);
return rc;
}
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
{
struct armcp_packet pkt = {};
struct cpucp_packet pkt = {};
void *eeprom_info_cpu_addr;
dma_addr_t eeprom_info_dma_addr;
long result;
@ -333,23 +333,24 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
max_size, &eeprom_info_dma_addr);
if (!eeprom_info_cpu_addr) {
dev_err(hdev->dev,
"Failed to allocate DMA memory for ArmCP EEPROM packet\n");
"Failed to allocate DMA memory for CPU-CP EEPROM packet\n");
return -ENOMEM;
}
memset(eeprom_info_cpu_addr, 0, max_size);
pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_EEPROM_DATA_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
pkt.data_max_size = cpu_to_le32(max_size);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_ARMCP_EEPROM_TIMEOUT_USEC, &result);
HL_CPUCP_EEPROM_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle ArmCP EEPROM packet, error %d\n", rc);
"Failed to handle CPU-CP EEPROM packet, error %d\n",
rc);
goto out;
}
@ -363,6 +364,77 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
return rc;
}
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters)
{
struct cpucp_packet pkt = {};
long result;
int rc;
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
/* Fetch PCI rx counter */
pkt.index = cpu_to_le32(cpucp_pcie_throughput_rx);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->rx_throughput = result;
/* Fetch PCI tx counter */
pkt.index = cpu_to_le32(cpucp_pcie_throughput_tx);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->tx_throughput = result;
/* Fetch PCI replay counter */
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_REPLAY_CNT_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->replay_cnt = (u32) result;
return rc;
}
int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
{
struct cpucp_packet pkt = {};
long result;
int rc;
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TOTAL_ENERGY_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CpuCP total energy pkt, error %d\n",
rc);
return rc;
}
*total_energy = result;
return rc;
}
static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
{
u32 err_val;
@ -402,8 +474,11 @@ static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
"Device boot error - NIC F/W initialization failed\n");
}
static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status)
static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
{
/* Some of the status codes below are deprecated in newer f/w
* versions but we keep them here for backward compatibility
*/
switch (status) {
case CPU_BOOT_STATUS_NA:
dev_err(hdev->dev,
@ -449,6 +524,48 @@ static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status)
}
}
int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 timeout)
{
u32 status;
int rc;
if (!hdev->cpu_enable)
return 0;
/* Need to check two possible scenarios:
*
* CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT - for newer firmwares where
* the preboot is waiting for the boot fit
*
* All other status values - for older firmwares where the uboot was
* loaded from the FLASH
*/
rc = hl_poll_timeout(
hdev,
cpu_boot_status_reg,
status,
(status == CPU_BOOT_STATUS_IN_UBOOT) ||
(status == CPU_BOOT_STATUS_DRAM_RDY) ||
(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
(status == CPU_BOOT_STATUS_SRAM_AVAIL) ||
(status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT),
10000,
timeout);
if (rc) {
dev_err(hdev->dev, "Failed to read preboot version\n");
detect_cpu_boot_status(hdev, status);
fw_read_errors(hdev, boot_err0_reg);
return -EIO;
}
hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
return 0;
}
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
u32 boot_err0_reg, bool skip_bmc,
@ -514,15 +631,11 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
10000,
cpu_timeout);
/* Read U-Boot, preboot versions now in case we will later fail */
/* Read U-Boot version now in case we will later fail */
hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT);
hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
/* Some of the status codes below are deprecated in newer f/w
* versions but we keep them here for backward compatibility
*/
if (rc) {
hl_detect_cpu_boot_status(hdev, status);
detect_cpu_boot_status(hdev, status);
rc = -EIO;
goto out;
}
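
Every CPU-CP exchange in the hunk above follows the same few steps: zero a
cpucp_packet, place the opcode in the little-endian ctl word, send it through
the ASIC's send_cpu_message() callback and read back the 64-bit result. A
generic sketch of that pattern (the function name is made up and the opcode is
a placeholder, not a specific CPUCP_PACKET_* value):

/* Sketch of the CPU-CP request pattern used throughout firmware_if.c. */
static int example_cpucp_query(struct hl_device *hdev, u32 opcode, u64 *out)
{
        struct cpucp_packet pkt = {};
        long result;
        int rc;

        /* the opcode lives in the packed 'ctl' word, LE on the wire */
        pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);

        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
                                                sizeof(pkt), 0, &result);
        if (rc) {
                dev_err(hdev->dev, "CPU-CP packet %u failed, rc %d\n",
                        opcode, rc);
                return rc;
        }

        *out = (u64) result;
        return 0;
}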


@ -8,21 +8,33 @@
#ifndef HABANALABSP_H_
#define HABANALABSP_H_
#include "../include/common/armcp_if.h"
#include "../include/common/cpucp_if.h"
#include "../include/common/qman_if.h"
#include <uapi/misc/habanalabs.h>
#include <linux/cdev.h>
#include <linux/iopoll.h>
#include <linux/irqreturn.h>
#include <linux/dma-fence.h>
#include <linux/dma-direction.h>
#include <linux/scatterlist.h>
#include <linux/hashtable.h>
#include <linux/bitfield.h>
#define HL_NAME "habanalabs"
#define HL_MMAP_CB_MASK (0x8000000000000000ull >> PAGE_SHIFT)
/* Use upper bits of mmap offset to store habana driver specific information.
* bits[63:62] - Encode mmap type
* bits[45:0] - mmap offset value
*
* NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
* defines are w.r.t to PAGE_SIZE
*/
#define HL_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT)
#define HL_MMAP_TYPE_MASK (0x3ull << HL_MMAP_TYPE_SHIFT)
#define HL_MMAP_TYPE_CB (0x2ull << HL_MMAP_TYPE_SHIFT)
#define HL_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFull >> PAGE_SHIFT)
#define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK)
#define HL_PENDING_RESET_PER_SEC 30
@ -34,8 +46,8 @@
#define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */
#define HL_ARMCP_INFO_TIMEOUT_USEC 10000000 /* 10s */
#define HL_ARMCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */
#define HL_CPUCP_INFO_TIMEOUT_USEC 10000000 /* 10s */
#define HL_CPUCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */
#define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */
@ -66,6 +78,8 @@
#define HL_PCI_NUM_BARS 6
#define HL_MAX_DCORES 4
/**
* struct pgt_info - MMU hop page info.
* @node: hash linked-list node for the pgts shadow hash of pgts.
@ -222,12 +236,15 @@ enum hl_device_hw_state {
* @hop2_shift: shift of hop 2 mask.
* @hop3_shift: shift of hop 3 mask.
* @hop4_shift: shift of hop 4 mask.
* @hop5_shift: shift of hop 5 mask.
* @hop0_mask: mask to get the PTE address in hop 0.
* @hop1_mask: mask to get the PTE address in hop 1.
* @hop2_mask: mask to get the PTE address in hop 2.
* @hop3_mask: mask to get the PTE address in hop 3.
* @hop4_mask: mask to get the PTE address in hop 4.
* @hop5_mask: mask to get the PTE address in hop 5.
* @page_size: default page size used to allocate memory.
* @num_hops: The amount of hops supported by the translation table.
*/
struct hl_mmu_properties {
u64 start_addr;
@ -237,18 +254,21 @@ struct hl_mmu_properties {
u64 hop2_shift;
u64 hop3_shift;
u64 hop4_shift;
u64 hop5_shift;
u64 hop0_mask;
u64 hop1_mask;
u64 hop2_mask;
u64 hop3_mask;
u64 hop4_mask;
u64 hop5_mask;
u32 page_size;
u32 num_hops;
};
/**
* struct asic_fixed_properties - ASIC specific immutable properties.
* @hw_queues_props: H/W queues properties.
* @armcp_info: received various information from ArmCP regarding the H/W, e.g.
* @cpucp_info: received various information from CPU-CP regarding the H/W, e.g.
* available sensors.
* @uboot_ver: F/W U-boot version.
* @preboot_ver: F/W Preboot version.
@ -271,6 +291,10 @@ struct hl_mmu_properties {
* @pcie_aux_dbi_reg_addr: Address of the PCIE_AUX DBI register.
* @mmu_pgt_addr: base physical address in DRAM of MMU page tables.
* @mmu_dram_default_page_addr: DRAM default page physical address.
* @cb_va_start_addr: virtual start address of command buffers which are mapped
* to the device's MMU.
* @cb_va_end_addr: virtual end address of command buffers which are mapped to
* the device's MMU.
* @mmu_pgt_size: MMU page tables total size.
* @mmu_pte_size: PTE size in MMU page tables.
* @mmu_hop_table_size: MMU hop table size.
@ -292,12 +316,16 @@ struct hl_mmu_properties {
* @max_queues: maximum amount of queues in the system
* @sync_stream_first_sob: first sync object available for sync stream use
* @sync_stream_first_mon: first monitor available for sync stream use
* @first_available_user_sob: first sob available for the user
* @first_available_user_mon: first monitor available for the user
* @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
* @fw_security_disabled: true if security measures are disabled in firmware,
* false otherwise
*/
struct asic_fixed_properties {
struct hw_queue_properties *hw_queues_props;
struct armcp_info armcp_info;
struct cpucp_info cpucp_info;
char uboot_ver[VERSION_MAX_LEN];
char preboot_ver[VERSION_MAX_LEN];
struct hl_mmu_properties dmmu;
@ -317,6 +345,8 @@ struct asic_fixed_properties {
u64 pcie_aux_dbi_reg_addr;
u64 mmu_pgt_addr;
u64 mmu_dram_default_page_addr;
u64 cb_va_start_addr;
u64 cb_va_end_addr;
u32 mmu_pgt_size;
u32 mmu_pte_size;
u32 mmu_hop_table_size;
@ -338,13 +368,29 @@ struct asic_fixed_properties {
u32 max_queues;
u16 sync_stream_first_sob;
u16 sync_stream_first_mon;
u16 first_available_user_sob[HL_MAX_DCORES];
u16 first_available_user_mon[HL_MAX_DCORES];
u8 tpc_enabled_mask;
u8 completion_queues_count;
u8 fw_security_disabled;
};
/**
* struct hl_fence - software synchronization primitive
* @completion: fence is implemented using completion
* @refcount: refcount for this fence
* @error: mark this fence with error
*
*/
struct hl_fence {
struct completion completion;
struct kref refcount;
int error;
};
/**
* struct hl_cs_compl - command submission completion object.
* @base_fence: kernel fence object.
* @base_fence: hl fence object.
* @lock: spinlock to protect fence.
* @hdev: habanalabs device structure.
* @hw_sob: the H/W SOB used in this signal/wait CS.
@ -353,7 +399,7 @@ struct asic_fixed_properties {
* @sob_val: the SOB value that is used in this signal/wait CS.
*/
struct hl_cs_compl {
struct dma_fence base_fence;
struct hl_fence base_fence;
spinlock_t lock;
struct hl_device *hdev;
struct hl_hw_sob *hw_sob;
@ -380,36 +426,41 @@ struct hl_cb_mgr {
* struct hl_cb - describes a Command Buffer.
* @refcount: reference counter for usage of the CB.
* @hdev: pointer to device this CB belongs to.
* @ctx: pointer to the CB owner's context.
* @lock: spinlock to protect mmap/cs flows.
* @debugfs_list: node in debugfs list of command buffers.
* @pool_list: node in pool list of command buffers.
* @va_block_list: list of virtual addresses blocks of the CB if it is mapped to
* the device's MMU.
* @id: the CB's ID.
* @kernel_address: Holds the CB's kernel virtual address.
* @bus_address: Holds the CB's DMA address.
* @mmap_size: Holds the CB's size that was mmaped.
* @size: holds the CB's size.
* @id: the CB's ID.
* @cs_cnt: holds number of CS that this CB participates in.
* @ctx_id: holds the ID of the owner's context.
* @mmap: true if the CB is currently mmaped to user.
* @is_pool: true if CB was acquired from the pool, false otherwise.
* @is_internal: internaly allocated
* @is_mmu_mapped: true if the CB is mapped to the device's MMU.
*/
struct hl_cb {
struct kref refcount;
struct hl_device *hdev;
struct hl_ctx *ctx;
spinlock_t lock;
struct list_head debugfs_list;
struct list_head pool_list;
struct list_head va_block_list;
u64 id;
u64 kernel_address;
dma_addr_t bus_address;
u32 mmap_size;
u32 size;
u32 id;
u32 cs_cnt;
u32 ctx_id;
u8 mmap;
u8 is_pool;
u8 is_internal;
u8 is_mmu_mapped;
};
@ -435,7 +486,7 @@ struct hl_cs_job;
#define HL_EQ_LENGTH 64
#define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
/* Host <-> ArmCP shared memory size */
/* Host <-> CPU-CP shared memory size */
#define HL_CPU_ACCESSIBLE_MEM_SIZE SZ_2M
/**
@ -617,7 +668,7 @@ enum div_select_defs {
* @debugfs_read32: debug interface for reading u32 from DRAM/SRAM.
* @debugfs_write32: debug interface for writing u32 to DRAM/SRAM.
* @add_device_attr: add ASIC specific device attributes.
* @handle_eqe: handle event queue entry (IRQ) from ArmCP.
* @handle_eqe: handle event queue entry (IRQ) from CPU-CP.
* @set_pll_profile: change PLL profile (manual/automatic).
* @get_events_stat: retrieve event queue entries histogram.
* @read_pte: read MMU page table entry from DRAM.
@ -626,7 +677,7 @@ enum div_select_defs {
* (L1 only) or hard (L0 & L1) flush.
* @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
* ASID-VA-size mask.
* @send_heartbeat: send is-alive packet to ArmCP and verify response.
* @send_heartbeat: send is-alive packet to CPU-CP and verify response.
* @set_clock_gating: enable/disable clock gating per engine according to
* clock gating mask in hdev
* @disable_clock_gating: disable clock gating completely
@ -644,8 +695,6 @@ enum div_select_defs {
* ASIC
* @get_hw_state: retrieve the H/W state
* @pci_bars_map: Map PCI BARs.
* @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
* old address the bar pointed to or U64_MAX for failure
* @init_iatu: Initialize the iATU unit inside the PCI controller.
* @rreg: Read a register. Needed for simulator support.
* @wreg: Write a register. Needed for simulator support.
@ -679,7 +728,7 @@ struct hl_asic_funcs {
int (*suspend)(struct hl_device *hdev);
int (*resume)(struct hl_device *hdev);
int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
u64 kaddress, phys_addr_t paddress, u32 size);
void *cpu_addr, dma_addr_t dma_addr, size_t size);
void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
void (*pqe_write)(struct hl_device *hdev, __le64 *pqe,
struct hl_bd *bd);
@ -736,7 +785,7 @@ struct hl_asic_funcs {
void (*set_clock_gating)(struct hl_device *hdev);
void (*disable_clock_gating)(struct hl_device *hdev);
int (*debug_coresight)(struct hl_device *hdev, void *data);
bool (*is_device_idle)(struct hl_device *hdev, u32 *mask,
bool (*is_device_idle)(struct hl_device *hdev, u64 *mask,
struct seq_file *s);
int (*soft_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
@ -748,7 +797,6 @@ struct hl_asic_funcs {
u16 len, u32 timeout, long *result);
enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev);
int (*pci_bars_map)(struct hl_device *hdev);
u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
int (*init_iatu)(struct hl_device *hdev);
u32 (*rreg)(struct hl_device *hdev, u32 reg);
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
@ -800,7 +848,7 @@ struct hl_va_range {
* @hdev: pointer to the device structure.
* @refcount: reference counter for the context. Context is released only when
* this hits 0l. It is incremented on CS and CS_WAIT.
* @cs_pending: array of DMA fence objects representing pending CS.
* @cs_pending: array of hl fence objects representing pending CS.
* @host_va_range: holds available virtual addresses for host mappings.
* @host_huge_va_range: holds available virtual addresses for host mappings
* with huge pages.
@ -809,6 +857,8 @@ struct hl_va_range {
* @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
* MMU hash or walking the PGT requires talking this lock.
* @debugfs_list: node in debugfs list of contexts.
* @cb_va_pool: device VA pool for command buffers which are mapped to the
* device's MMU.
* @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
* to user so user could inquire about CS. It is used as
* index to cs_pending array.
@ -832,7 +882,7 @@ struct hl_ctx {
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct kref refcount;
struct dma_fence **cs_pending;
struct hl_fence **cs_pending;
struct hl_va_range *host_va_range;
struct hl_va_range *host_huge_va_range;
struct hl_va_range *dram_va_range;
@ -840,6 +890,7 @@ struct hl_ctx {
struct mutex mmu_lock;
struct list_head debugfs_list;
struct hl_cs_counters cs_counters;
struct gen_pool *cb_va_pool;
u64 cs_sequence;
u64 *dram_default_hops;
spinlock_t cs_lock;
@ -919,8 +970,8 @@ struct hl_cs {
struct list_head job_list;
spinlock_t job_lock;
struct kref refcount;
struct dma_fence *fence;
struct dma_fence *signal_fence;
struct hl_fence *fence;
struct hl_fence *signal_fence;
struct work_struct finish_work;
struct delayed_work work_tdr;
struct list_head mirror_node;
@ -1395,6 +1446,44 @@ struct hl_device_idle_busy_ts {
ktime_t busy_to_idle_ts;
};
/**
* struct hl_mmu_priv - used for holding per-device mmu internal information.
* @mmu_pgt_pool: pool of page tables used by MMU for allocating hops.
* @mmu_shadow_hop0: shadow array of hop0 tables.
*/
struct hl_mmu_priv {
struct gen_pool *mmu_pgt_pool;
void *mmu_shadow_hop0;
};
/**
* struct hl_mmu_funcs - Device related MMU functions.
* @init: initialize the MMU module.
* @fini: release the MMU module.
* @ctx_init: Initialize a context for using the MMU module.
* @ctx_fini: disable a ctx from using the mmu module.
* @map: maps a virtual address to physical address for a context.
* @unmap: unmap a virtual address of a context.
* @flush: flush all writes from all cores to reach device MMU.
* @swap_out: marks all mapping of the given context as swapped out.
* @swap_in: marks all mapping of the given context as swapped in.
*/
struct hl_mmu_funcs {
int (*init)(struct hl_device *hdev);
void (*fini)(struct hl_device *hdev);
int (*ctx_init)(struct hl_ctx *ctx);
void (*ctx_fini)(struct hl_ctx *ctx);
int (*map)(struct hl_ctx *ctx,
u64 virt_addr, u64 phys_addr, u32 page_size,
bool is_dram_addr);
int (*unmap)(struct hl_ctx *ctx,
u64 virt_addr, bool is_dram_addr);
void (*flush)(struct hl_ctx *ctx);
void (*swap_out)(struct hl_ctx *ctx);
void (*swap_in)(struct hl_ctx *ctx);
};
/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
@ -1407,8 +1496,8 @@ struct hl_device_idle_busy_ts {
* @dev: related kernel basic device structure.
* @dev_ctrl: related kernel device structure for the control device
* @work_freq: delayed work to lower device frequency if possible.
* @work_heartbeat: delayed work for ArmCP is-alive check.
* @asic_name: ASIC specific nmae.
* @work_heartbeat: delayed work for CPU-CP is-alive check.
* @asic_name: ASIC specific name.
* @asic_type: ASIC specific type.
* @completion_queue: array of hl_cq.
* @cq_wq: work queues of completion queues for executing work in process
@ -1419,22 +1508,20 @@ struct hl_device_idle_busy_ts {
* @hw_queues_mirror_list: CS mirror list for TDR.
* @hw_queues_mirror_lock: protects hw_queues_mirror_list.
* @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
* @event_queue: event queue for IRQ from ArmCP.
* @event_queue: event queue for IRQ from CPU-CP.
* @dma_pool: DMA pool for small allocations.
* @cpu_accessible_dma_mem: Host <-> ArmCP shared memory CPU address.
* @cpu_accessible_dma_address: Host <-> ArmCP shared memory DMA address.
* @cpu_accessible_dma_pool: Host <-> ArmCP shared memory pool.
* @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address.
* @cpu_accessible_dma_address: Host <-> CPU-CP shared memory DMA address.
* @cpu_accessible_dma_pool: Host <-> CPU-CP shared memory pool.
* @asid_bitmap: holds used/available ASIDs.
* @asid_mutex: protects asid_bitmap.
* @send_cpu_message_lock: enforces only one message in Host <-> ArmCP queue.
* @send_cpu_message_lock: enforces only one message in Host <-> CPU-CP queue.
* @debug_lock: protects critical section of setting debug mode for device
* @asic_prop: ASIC specific immutable properties.
* @asic_funcs: ASIC specific functions.
* @asic_specific: ASIC specific information to use only from ASIC files.
* @mmu_pgt_pool: pool of available MMU hops.
* @vm: virtual memory manager for MMU.
* @mmu_cache_lock: protects MMU cache invalidation as it can serve one context.
* @mmu_shadow_hop0: shadow mapping of the MMU hop 0 zone.
* @hwmon_dev: H/W monitor device.
* @pm_mng_profile: current power management profile.
* @hl_chip_info: ASIC's sensors information.
@ -1452,6 +1539,8 @@ struct hl_device_idle_busy_ts {
* @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy
* and vice-versa
* @aggregated_cs_counters: aggregated cs counters among all contexts
* @mmu_priv: device-specific MMU data.
* @mmu_func: device-related MMU functions.
* @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
@ -1471,6 +1560,7 @@ struct hl_device_idle_busy_ts {
* @soft_reset_cnt: number of soft reset since the driver was loaded.
* @hard_reset_cnt: number of hard reset since the driver was loaded.
* @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
* @clk_throttling_reason: bitmask represents the current clk throttling reasons
* @id: device minor.
* @id_control: minor of the control device
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@ -1479,7 +1569,7 @@ struct hl_device_idle_busy_ts {
* @late_init_done: is late init stage was done during initialization.
* @hwmon_initialized: is H/W monitor sensors was initialized.
* @hard_reset_pending: is there a hard reset work pending.
* @heartbeat: is heartbeat sanity check towards ArmCP enabled.
* @heartbeat: is heartbeat sanity check towards CPU-CP enabled.
* @reset_on_lockup: true if a reset should be done in case of stuck CS, false
* otherwise.
* @dram_supports_virtual_memory: is MMU enabled towards DRAM.
@ -1501,6 +1591,7 @@ struct hl_device_idle_busy_ts {
* @sync_stream_queue_idx: helper index for sync stream queues initialization.
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
* @supports_cb_mapping: is mapping a CB to the device's MMU supported.
*/
struct hl_device {
struct pci_dev *pdev;
@ -1513,7 +1604,7 @@ struct hl_device {
struct device *dev_ctrl;
struct delayed_work work_freq;
struct delayed_work work_heartbeat;
char asic_name[16];
char asic_name[32];
enum hl_asic_type asic_type;
struct hl_cq *completion_queue;
struct workqueue_struct **cq_wq;
@ -1535,10 +1626,8 @@ struct hl_device {
struct asic_fixed_properties asic_prop;
const struct hl_asic_funcs *asic_funcs;
void *asic_specific;
struct gen_pool *mmu_pgt_pool;
struct hl_vm vm;
struct mutex mmu_cache_lock;
void *mmu_shadow_hop0;
struct device *hwmon_dev;
enum hl_pm_mng_profile pm_mng_profile;
struct hwmon_chip_info *hl_chip_info;
@ -1562,19 +1651,23 @@ struct hl_device {
struct hl_cs_counters aggregated_cs_counters;
struct hl_mmu_priv mmu_priv;
struct hl_mmu_funcs mmu_func;
atomic64_t dram_used_mem;
u64 timeout_jiffies;
u64 max_power;
u64 clock_gating_mask;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
enum armcp_card_types card_type;
enum cpucp_card_types card_type;
int cs_active_cnt;
u32 major;
u32 high_pll;
u32 soft_reset_cnt;
u32 hard_reset_cnt;
u32 idle_busy_ts_idx;
u32 clk_throttling_reason;
u16 id;
u16 id_control;
u16 cpu_pci_msb_addr;
@ -1598,6 +1691,7 @@ struct hl_device {
u8 sync_stream_queue_idx;
u8 supports_coresight;
u8 supports_soft_reset;
u8 supports_cb_mapping;
/* Parameters for bring-up */
u8 mmu_enable;
@ -1739,7 +1833,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
void hl_ctx_do_release(struct kref *ref);
void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx);
int hl_ctx_put(struct hl_ctx *ctx);
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
@ -1755,7 +1849,7 @@ int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms);
int hl_build_hwmon_channel_info(struct hl_device *hdev,
struct armcp_sensor *sensors_arr);
struct cpucp_sensor *sensors_arr);
int hl_sysfs_init(struct hl_device *hdev);
void hl_sysfs_fini(struct hl_device *hdev);
@ -1763,8 +1857,9 @@ void hl_sysfs_fini(struct hl_device *hdev);
int hl_hwmon_init(struct hl_device *hdev);
void hl_hwmon_fini(struct hl_device *hdev);
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size,
u64 *handle, int ctx_id, bool internal_cb);
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
bool map_cb, u64 *handle);
int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
@ -1776,11 +1871,15 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
bool internal_cb);
int hl_cb_pool_init(struct hl_device *hdev);
int hl_cb_pool_fini(struct hl_device *hdev);
int hl_cb_va_pool_init(struct hl_ctx *ctx);
void hl_cb_va_pool_fini(struct hl_ctx *ctx);
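/*
 * A sketch (not taken from this patch) of how the reworked hl_cb_create()
 * prototype above might be called when the command buffer should also be
 * mapped into the device's MMU: the owning context is passed in and map_cb
 * is set to true. The hpriv-> argument values are illustrative only.
 *
 *	rc = hl_cb_create(hdev, &hpriv->cb_mgr, hpriv->ctx, cb_size,
 *			false, true, &cb_handle);
 *
 * The false/true pair are the internal_cb and map_cb parameters, in that
 * order. hl_cb_va_pool_init() / hl_cb_va_pool_fini() appear to manage the
 * per-context pool of device virtual addresses that such mapped CBs are
 * allocated from.
 */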
void hl_cs_rollback_all(struct hl_device *hdev);
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void hl_sob_reset_error(struct kref *ref);
void hl_fence_put(struct hl_fence *fence);
void hl_fence_get(struct hl_fence *fence);
void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev);
@ -1810,6 +1909,8 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
bool flush_pte);
void hl_mmu_swap_out(struct hl_ctx *ctx);
void hl_mmu_swap_in(struct hl_ctx *ctx);
int hl_mmu_if_set_funcs(struct hl_device *hdev);
void hl_mmu_v1_set_funcs(struct hl_device *hdev);
int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
void __iomem *dst);
@ -1825,23 +1926,28 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr);
int hl_fw_send_heartbeat(struct hl_device *hdev);
int hl_fw_armcp_info_get(struct hl_device *hdev);
int hl_fw_cpucp_info_get(struct hl_device *hdev);
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters);
int hl_fw_cpucp_total_energy_get(struct hl_device *hdev,
u64 *total_energy);
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
u32 boot_err0_reg, bool skip_bmc,
u32 cpu_timeout, u32 boot_fit_timeout);
int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 timeout);
int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
bool is_wc[3]);
int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
u64 addr);
int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
struct hl_inbound_pci_region *pci_region);
int hl_pci_set_outbound_region(struct hl_device *hdev,
struct hl_outbound_pci_region *pci_region);
int hl_pci_init(struct hl_device *hdev);
int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 preboot_ver_timeout);
void hl_pci_fini(struct hl_device *hdev);
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);


@ -11,6 +11,7 @@
#include "habanalabs.h"
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/module.h>
#define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
@ -408,6 +409,8 @@ static int hl_pci_probe(struct pci_dev *pdev,
pci_set_drvdata(pdev, hdev);
pci_enable_pcie_error_reporting(pdev);
rc = hl_device_init(hdev, hl_class);
if (rc) {
dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
@ -440,22 +443,93 @@ static void hl_pci_remove(struct pci_dev *pdev)
return;
hl_device_fini(hdev);
pci_disable_pcie_error_reporting(pdev);
pci_set_drvdata(pdev, NULL);
destroy_hdev(hdev);
}
/**
* hl_pci_err_detected - a PCI bus error detected on this device
*
* @pdev: pointer to pci device
* @state: PCI error type
*
* Called by the PCI subsystem whenever a non-correctable
* PCI bus error is detected
*/
static pci_ers_result_t
hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
struct hl_device *hdev = pci_get_drvdata(pdev);
enum pci_ers_result result;
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
dev_warn(hdev->dev, "frozen state error detected\n");
result = PCI_ERS_RESULT_NEED_RESET;
break;
case pci_channel_io_perm_failure:
dev_warn(hdev->dev, "failure state error detected\n");
result = PCI_ERS_RESULT_DISCONNECT;
break;
default:
result = PCI_ERS_RESULT_NONE;
}
hdev->asic_funcs->halt_engines(hdev, true);
return result;
}
/**
* hl_pci_err_resume - resume after a PCI slot reset
*
* @pdev: pointer to pci device
*
*/
static void hl_pci_err_resume(struct pci_dev *pdev)
{
struct hl_device *hdev = pci_get_drvdata(pdev);
dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
hl_device_resume(hdev);
}
/**
* hl_pci_err_slot_reset - a PCI slot reset has just happened
*
* @pdev: pointer to pci device
*
* Determine if the driver can recover from the PCI slot reset
*/
static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
{
return PCI_ERS_RESULT_RECOVERED;
}
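/*
 * Taken together, the three callbacks above form the driver's PCIe AER flow:
 * hl_pci_err_detected() runs first and decides whether the error is
 * recoverable, halting the engines in every non-normal case; the PCI core
 * may then reset the slot and ask hl_pci_err_slot_reset() whether recovery
 * succeeded; finally hl_pci_err_resume() resumes the device through
 * hl_device_resume() once the link is back.
 */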
static const struct dev_pm_ops hl_pm_ops = {
.suspend = hl_pmops_suspend,
.resume = hl_pmops_resume,
};
static const struct pci_error_handlers hl_pci_err_handler = {
.error_detected = hl_pci_err_detected,
.slot_reset = hl_pci_err_slot_reset,
.resume = hl_pci_err_resume,
};
static struct pci_driver hl_pci_driver = {
.name = HL_NAME,
.id_table = ids,
.probe = hl_pci_probe,
.remove = hl_pci_remove,
.driver.pm = &hl_pm_ops,
.err_handler = &hl_pci_err_handler,
};
/*


@ -8,6 +8,7 @@
#include <uapi/misc/habanalabs.h>
#include "habanalabs.h"
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
@ -64,14 +65,14 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.dram_enabled = 1;
hw_ip.num_of_events = prop->num_of_events;
memcpy(hw_ip.armcp_version, prop->armcp_info.armcp_version,
memcpy(hw_ip.cpucp_version, prop->cpucp_info.cpucp_version,
min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN));
memcpy(hw_ip.card_name, prop->armcp_info.card_name,
memcpy(hw_ip.card_name, prop->cpucp_info.card_name,
min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN));
hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
hw_ip.module_id = le32_to_cpu(prop->armcp_info.card_location);
hw_ip.cpld_version = le32_to_cpu(prop->cpucp_info.cpld_version);
hw_ip.module_id = le32_to_cpu(prop->cpucp_info.card_location);
hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
@ -131,7 +132,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
return -EINVAL;
hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev,
&hw_idle.busy_engines_mask, NULL);
&hw_idle.busy_engines_mask_ext, NULL);
return copy_to_user(out, &hw_idle,
min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
@ -276,10 +277,45 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
}
static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_pci_counters pci_counters = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hl_fw_cpucp_pci_counters_get(hdev, &pci_counters);
if (rc)
return rc;
return copy_to_user(out, &pci_counters,
min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0;
}
static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_clk_throttle clk_throttle = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
return copy_to_user(out, &clk_throttle,
min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
}
static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_cs_counters cs_counters = {0};
struct hl_info_cs_counters cs_counters = { {0} };
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
@ -297,6 +333,51 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0;
}
static int sync_manager_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_info_sync_manager sm_info = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
if (args->dcore_id >= HL_MAX_DCORES)
return -EINVAL;
sm_info.first_available_sync_object =
prop->first_available_user_sob[args->dcore_id];
sm_info.first_available_monitor =
prop->first_available_user_mon[args->dcore_id];
return copy_to_user(out, &sm_info, min_t(size_t, (size_t) max_size,
sizeof(sm_info))) ? -EFAULT : 0;
}
static int total_energy_consumption_info(struct hl_fpriv *hpriv,
struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_energy total_energy = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hl_fw_cpucp_total_energy_get(hdev,
&total_energy.total_energy_consumption);
if (rc)
return rc;
return copy_to_user(out, &total_energy,
min((size_t) max_size, sizeof(total_energy))) ? -EFAULT : 0;
}
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
@ -360,6 +441,18 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_CS_COUNTERS:
return cs_counters_info(hpriv, args);
case HL_INFO_PCI_COUNTERS:
return pci_counters_info(hpriv, args);
case HL_INFO_CLK_THROTTLE_REASON:
return clk_throttle_info(hpriv, args);
case HL_INFO_SYNC_MANAGER:
return sync_manager_info(hpriv, args);
case HL_INFO_TOTAL_ENERGY:
return total_energy_consumption_info(hpriv, args);
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
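The new INFO opcodes above all follow the same contract: user space passes an opcode, a destination buffer and its size in struct hl_info_args, and the driver copies back at most that many bytes. A minimal user-space sketch for the sync manager query handled by sync_manager_info() above, assuming the uapi header is installed as <misc/habanalabs.h> and that /dev/hl0 is the device node (both assumptions, not taken from this diff):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>	/* HL_IOCTL_INFO, struct hl_info_args, ... */

int main(void)
{
	struct hl_info_sync_manager sm_info;
	struct hl_info_args args;
	int fd, rc;

	fd = open("/dev/hl0", O_RDWR);	/* device node name is an assumption */
	if (fd < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	memset(&sm_info, 0, sizeof(sm_info));
	args.op = HL_INFO_SYNC_MANAGER;
	args.return_pointer = (uint64_t)(uintptr_t)&sm_info;
	args.return_size = sizeof(sm_info);
	args.dcore_id = 0;		/* must be below HL_MAX_DCORES */

	rc = ioctl(fd, HL_IOCTL_INFO, &args);
	if (!rc)
		printf("first free sync object %llu, first free monitor %llu\n",
			(unsigned long long)sm_info.first_available_sync_object,
			(unsigned long long)sm_info.first_available_monitor);

	close(fd);
	return rc ? 1 : 0;
}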


@ -288,10 +288,10 @@ static void ext_queue_schedule_job(struct hl_cs_job *job)
ptr = cb->bus_address;
cq_pkt.data = cpu_to_le32(
((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
& CQ_ENTRY_SHADOW_INDEX_MASK) |
(1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT) |
(1 << CQ_ENTRY_READY_SHIFT));
((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
& CQ_ENTRY_SHADOW_INDEX_MASK) |
FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
FIELD_PREP(CQ_ENTRY_READY_MASK, 1));
/*
* No need to protect pi_offset because scheduling to the
@ -474,7 +474,7 @@ static void init_signal_wait_cs(struct hl_cs *cs)
* wait CS was submitted.
*/
mb();
dma_fence_put(cs->signal_fence);
hl_fence_put(cs->signal_fence);
cs->signal_fence = NULL;
}
}
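The hunk above in ext_queue_schedule_job() is one instance of the tree-wide move to FIELD_PREP(): instead of open-coding "1 << SOME_SHIFT", the value is placed into the field described by a mask. A stand-alone illustration of the idiom with locally defined stand-ins for GENMASK()/FIELD_PREP(); the mask layouts below are invented for the demo and are not the driver's real CQ_ENTRY_* definitions:

#include <stdio.h>
#include <stdint.h>

/* Minimal userspace stand-ins for the kernel's GENMASK()/FIELD_PREP() */
#define DEMO_GENMASK(h, l)	((~0u << (l)) & (~0u >> (31 - (h))))
#define DEMO_FIELD_PREP(mask, val) \
	(((uint32_t)(val) << __builtin_ctz(mask)) & (mask))

#define DEMO_SHADOW_INDEX_MASK	DEMO_GENMASK(7, 0)	/* invented layout */
#define DEMO_INDEX_VALID_MASK	DEMO_GENMASK(8, 8)
#define DEMO_READY_MASK		DEMO_GENMASK(9, 9)

int main(void)
{
	uint32_t data = DEMO_FIELD_PREP(DEMO_SHADOW_INDEX_MASK, 0x2a) |
			DEMO_FIELD_PREP(DEMO_INDEX_VALID_MASK, 1) |
			DEMO_FIELD_PREP(DEMO_READY_MASK, 1);

	printf("data = 0x%08x\n", data);	/* prints data = 0x0000032a */
	return 0;
}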


@ -13,7 +13,7 @@
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)
int hl_build_hwmon_channel_info(struct hl_device *hdev,
struct armcp_sensor *sensors_arr)
struct cpucp_sensor *sensors_arr)
{
u32 counts[HWMON_NR_SENSOR_TYPES] = {0};
u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL};
@ -24,7 +24,7 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
enum hwmon_sensor_types type;
int rc, i, j;
for (i = 0 ; i < ARMCP_MAX_SENSORS ; i++) {
for (i = 0 ; i < CPUCP_MAX_SENSORS ; i++) {
type = le32_to_cpu(sensors_arr[i].type);
if ((type == 0) && (sensors_arr[i].flags == 0))
@ -311,13 +311,13 @@ static const struct hwmon_ops hl_hwmon_ops = {
int hl_get_temperature(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -337,13 +337,13 @@ int hl_get_temperature(struct hl_device *hdev,
int hl_set_temperature(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
@ -362,13 +362,13 @@ int hl_set_temperature(struct hl_device *hdev,
int hl_get_voltage(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -388,13 +388,13 @@ int hl_get_voltage(struct hl_device *hdev,
int hl_get_current(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -414,13 +414,13 @@ int hl_get_current(struct hl_device *hdev,
int hl_get_fan_speed(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FAN_SPEED_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FAN_SPEED_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -440,13 +440,13 @@ int hl_get_fan_speed(struct hl_device *hdev,
int hl_get_pwm_info(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -466,13 +466,13 @@ int hl_get_pwm_info(struct hl_device *hdev,
void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = cpu_to_le64(value);
@ -489,13 +489,13 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
int hl_set_voltage(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
@ -514,13 +514,13 @@ int hl_set_voltage(struct hl_device *hdev,
int hl_set_current(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
@ -549,7 +549,7 @@ int hl_hwmon_init(struct hl_device *hdev)
hdev->hl_chip_info->ops = &hl_hwmon_ops;
hdev->hwmon_dev = hwmon_device_register_with_info(dev,
prop->armcp_info.card_name, hdev,
prop->cpucp_info.card_name, hdev,
hdev->hl_chip_info, NULL);
if (IS_ERR(hdev->hwmon_dev)) {
rc = PTR_ERR(hdev->hwmon_dev);


@ -11,7 +11,7 @@
/**
* struct hl_eqe_work - This structure is used to schedule work of EQ
* entry and armcp_reset event
* entry and cpucp_reset event
*
* @eq_work: workqueue object to run when EQ entry is received
* @hdev: pointer to device structure


@ -505,41 +505,32 @@ static inline int add_va_block(struct hl_device *hdev,
}
/*
* get_va_block - get a virtual block with the requested size
*
* @hdev : pointer to the habanalabs device structure
* @va_range : pointer to the virtual addresses range
* @size : requested block size
* @hint_addr : hint for request address by the user
* @is_userptr : is host or DRAM memory
* get_va_block() - get a virtual block for the given size and alignment.
* @hdev: pointer to the habanalabs device structure.
* @va_range: pointer to the virtual addresses range.
* @size: requested block size.
* @hint_addr: hint for requested address by the user.
* @va_block_align: required alignment of the virtual block start address.
*
* This function does the following:
* - Iterate on the virtual block list to find a suitable virtual block for the
* requested size
* - Reserve the requested block and update the list
* - Return the start address of the virtual block
* given size and alignment.
* - Reserve the requested block and update the list.
* - Return the start address of the virtual block.
*/
static u64 get_va_block(struct hl_device *hdev,
struct hl_va_range *va_range, u64 size, u64 hint_addr,
bool is_userptr)
static u64 get_va_block(struct hl_device *hdev, struct hl_va_range *va_range,
u64 size, u64 hint_addr, u32 va_block_align)
{
struct hl_vm_va_block *va_block, *new_va_block = NULL;
u64 valid_start, valid_size, prev_start, prev_end, page_mask,
u64 valid_start, valid_size, prev_start, prev_end, align_mask,
res_valid_start = 0, res_valid_size = 0;
u32 page_size;
bool add_prev = false;
if (is_userptr)
/*
* We cannot know if the user allocated memory with huge pages
* or not, hence we continue with the biggest possible
* granularity.
*/
page_size = hdev->asic_prop.pmmu_huge.page_size;
else
page_size = hdev->asic_prop.dmmu.page_size;
align_mask = ~((u64)va_block_align - 1);
page_mask = ~((u64)page_size - 1);
/* check if hint_addr is aligned */
if (hint_addr & (va_block_align - 1))
hint_addr = 0;
mutex_lock(&va_range->lock);
@ -549,9 +540,9 @@ static u64 get_va_block(struct hl_device *hdev,
/* calc the first possible aligned addr */
valid_start = va_block->start;
if (valid_start & (page_size - 1)) {
valid_start &= page_mask;
valid_start += page_size;
if (valid_start & (va_block_align - 1)) {
valid_start &= align_mask;
valid_start += va_block_align;
if (valid_start > va_block->end)
continue;
}
@ -863,7 +854,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
struct hl_va_range *va_range;
enum vm_type_t *vm_type;
u64 ret_vaddr, hint_addr;
u32 handle = 0;
u32 handle = 0, va_block_align;
int rc;
bool is_userptr = args->flags & HL_MEM_USERPTR;
@ -873,6 +864,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
if (is_userptr) {
u64 addr = args->map_host.host_virt_addr,
size = args->map_host.mem_size;
u32 page_size = hdev->asic_prop.pmmu.page_size,
huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
rc = dma_map_host_va(hdev, addr, size, &userptr);
if (rc) {
@ -892,6 +885,27 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) userptr;
hint_addr = args->map_host.hint_addr;
handle = phys_pg_pack->handle;
/* get required alignment */
if (phys_pg_pack->page_size == page_size) {
va_range = ctx->host_va_range;
/*
* huge page alignment may be needed in case of regular
* page mapping, depending on the host VA alignment
*/
if (addr & (huge_page_size - 1))
va_block_align = page_size;
else
va_block_align = huge_page_size;
} else {
/*
* huge page alignment is needed in case of huge page
* mapping
*/
va_range = ctx->host_huge_va_range;
va_block_align = huge_page_size;
}
} else {
handle = lower_32_bits(args->map_device.handle);
@ -912,6 +926,10 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) phys_pg_pack;
hint_addr = args->map_device.hint_addr;
/* DRAM VA alignment is the same as the DRAM page size */
va_range = ctx->dram_va_range;
va_block_align = hdev->asic_prop.dmmu.page_size;
}
/*
@ -933,16 +951,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
goto hnode_err;
}
if (is_userptr)
if (phys_pg_pack->page_size == hdev->asic_prop.pmmu.page_size)
va_range = ctx->host_va_range;
else
va_range = ctx->host_huge_va_range;
else
va_range = ctx->dram_va_range;
ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
hint_addr, is_userptr);
hint_addr, va_block_align);
if (!ret_vaddr) {
dev_err(hdev->dev, "no available va block for handle %u\n",
handle);
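The va_block_align selection above is the "smallest possible alignment" change from the tag: a host mapping built from regular pages only needs huge-page alignment of its device VA if the host VA itself happens to be huge-page aligned. A stand-alone sketch of that arithmetic, with example page sizes and addresses (all values invented for illustration):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t page_size = 4096;		/* e.g. pmmu.page_size */
	uint64_t huge_page_size = 2ULL << 20;	/* e.g. pmmu_huge.page_size */

	/* host VA of the buffer being mapped; not 2MB aligned in this example */
	uint64_t host_addr = 0x7f0000201000ULL;

	uint64_t va_block_align = (host_addr & (huge_page_size - 1)) ?
					page_size : huge_page_size;
	uint64_t align_mask = ~(va_block_align - 1);

	/* first candidate block in the VA range, not aligned to va_block_align */
	uint64_t valid_start = 0x1000000123ULL;

	if (valid_start & (va_block_align - 1)) {
		valid_start &= align_mask;
		valid_start += va_block_align;
	}

	printf("alignment 0x%llx, first usable VA 0x%llx\n",
		(unsigned long long)va_block_align,
		(unsigned long long)valid_start);	/* 0x1000, 0x1000001000 */
	return 0;
}

With a huge-page-aligned host VA the same check picks huge_page_size as the alignment, exactly as the else branch in map_device_va() above does for huge-page and DRAM mappings.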


@ -1,258 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2016-2019 HabanaLabs, Ltd.
* Copyright 2016-2020 HabanaLabs, Ltd.
* All Rights Reserved.
*/
#include "habanalabs.h"
#include "../include/hw_ip/mmu/mmu_general.h"
#include <linux/genalloc.h>
#include <linux/slab.h>
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = NULL;
hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
(unsigned long) hop_addr)
if (hop_addr == pgt_info->shadow_addr)
break;
return pgt_info;
}
static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
{
struct hl_device *hdev = ctx->hdev;
gen_pool_free(hdev->mmu_pgt_pool, pgt_info->phys_addr,
hdev->asic_prop.mmu_hop_table_size);
hash_del(&pgt_info->node);
kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
kfree(pgt_info);
}
static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
_free_hop(ctx, pgt_info);
}
static u64 alloc_hop(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct pgt_info *pgt_info;
u64 phys_addr, shadow_addr;
pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
if (!pgt_info)
return ULLONG_MAX;
phys_addr = (u64) gen_pool_alloc(hdev->mmu_pgt_pool,
prop->mmu_hop_table_size);
if (!phys_addr) {
dev_err(hdev->dev, "failed to allocate page\n");
goto pool_add_err;
}
shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
GFP_KERNEL);
if (!shadow_addr)
goto shadow_err;
pgt_info->phys_addr = phys_addr;
pgt_info->shadow_addr = shadow_addr;
pgt_info->ctx = ctx;
pgt_info->num_of_ptes = 0;
hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
return shadow_addr;
shadow_err:
gen_pool_free(hdev->mmu_pgt_pool, phys_addr, prop->mmu_hop_table_size);
pool_add_err:
kfree(pgt_info);
return ULLONG_MAX;
}
static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
{
return ctx->hdev->asic_prop.mmu_pgt_addr +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static inline u64 get_hop0_addr(struct hl_ctx *ctx)
{
return (u64) (uintptr_t) ctx->hdev->mmu_shadow_hop0 +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static inline void flush(struct hl_ctx *ctx)
{
/* flush all writes from all cores to reach PCI */
mb();
ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx));
}
/* transform the value to physical address when writing to H/W */
static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
{
/*
* The value to write is actually the address of the next shadow hop +
* flags at the 12 LSBs.
* Hence in order to get the value to write to the physical PTE, we
* clear the 12 LSBs and translate the shadow hop to its associated
* physical hop, and add back the original 12 LSBs.
*/
u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
(val & FLAGS_MASK);
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
phys_val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* do not transform the value to physical address when writing to H/W */
static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr,
u64 val)
{
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* clear the last and present bits */
static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr)
{
/* no need to transform the value to physical address */
write_final_pte(ctx, pte_addr, 0);
}
static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
{
get_pgt_info(ctx, hop_addr)->num_of_ptes++;
}
/*
* put_pte - decrement the num of ptes and free the hop if possible
*
* @ctx: pointer to the context structure
* @hop_addr: addr of the hop
*
* This function returns the number of ptes left on this hop. If the number is
* 0, it means the pte was freed.
*/
static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
int num_of_ptes_left;
pgt_info->num_of_ptes--;
/*
* Need to save the number of ptes left because free_hop might free
* the pgt_info
*/
num_of_ptes_left = pgt_info->num_of_ptes;
if (!num_of_ptes_left)
_free_hop(ctx, pgt_info);
return num_of_ptes_left;
}
static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr, u64 mask, u64 shift)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & mask) >> shift);
}
static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
mmu_prop->hop0_shift);
}
static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
mmu_prop->hop1_shift);
}
static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
mmu_prop->hop2_shift);
}
static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
mmu_prop->hop3_shift);
}
static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
mmu_prop->hop4_shift);
}
static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
return curr_pte & HOP_PHYS_ADDR_MASK;
else
return ULLONG_MAX;
}
static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
bool *is_new_hop)
{
u64 hop_addr = get_next_hop_addr(ctx, curr_pte);
if (hop_addr == ULLONG_MAX) {
hop_addr = alloc_hop(ctx);
*is_new_hop = (hop_addr != ULLONG_MAX);
}
return hop_addr;
}
/* translates shadow address inside hop to a physical address */
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
{
u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1);
u64 shadow_hop_addr = shadow_addr & ~page_mask;
u64 pte_offset = shadow_addr & page_mask;
u64 phys_hop_addr;
if (shadow_hop_addr != get_hop0_addr(ctx))
phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
else
phys_hop_addr = get_phys_hop0_addr(ctx);
return phys_hop_addr + pte_offset;
}
#include "habanalabs.h"
static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
{
@ -263,155 +18,6 @@ static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
prop->dmmu.end_addr);
}
static int dram_default_mapping_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr, pte_val;
int rc, i, j, hop3_allocated = 0;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return 0;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL);
if (!ctx->dram_default_hops)
return -ENOMEM;
hop0_addr = get_hop0_addr(ctx);
hop1_addr = alloc_hop(ctx);
if (hop1_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 1\n");
rc = -ENOMEM;
goto hop1_err;
}
ctx->dram_default_hops[total_hops - 1] = hop1_addr;
hop2_addr = alloc_hop(ctx);
if (hop2_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 2\n");
rc = -ENOMEM;
goto hop2_err;
}
ctx->dram_default_hops[total_hops - 2] = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
ctx->dram_default_hops[i] = alloc_hop(ctx);
if (ctx->dram_default_hops[i] == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i);
rc = -ENOMEM;
goto hop3_err;
}
hop3_allocated++;
}
/* need only pte 0 in hops 0 and 1 */
pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_addr, pte_val);
pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_addr, pte_val);
get_pte(ctx, hop1_addr);
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, pte_val);
get_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
LAST_MASK | PAGE_PRESENT_MASK;
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
write_final_pte(ctx, hop3_pte_addr, pte_val);
get_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
flush(ctx);
return 0;
hop3_err:
for (i = 0 ; i < hop3_allocated ; i++)
free_hop(ctx, ctx->dram_default_hops[i]);
free_hop(ctx, hop2_addr);
hop2_err:
free_hop(ctx, hop1_addr);
hop1_err:
kfree(ctx->dram_default_hops);
return rc;
}
static void dram_default_mapping_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr;
int i, j;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
hop0_addr = get_hop0_addr(ctx);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
hop1_addr = ctx->dram_default_hops[total_hops - 1];
hop2_addr = ctx->dram_default_hops[total_hops - 2];
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
clear_pte(ctx, hop3_pte_addr);
put_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
hop2_pte_addr = hop2_addr;
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
clear_pte(ctx, hop2_pte_addr);
put_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
clear_pte(ctx, hop1_addr);
put_pte(ctx, hop1_addr);
clear_pte(ctx, hop0_addr);
kfree(ctx->dram_default_hops);
flush(ctx);
}
/**
* hl_mmu_init() - initialize the MMU module.
* @hdev: habanalabs device structure.
@ -424,45 +30,10 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
*/
int hl_mmu_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
if (!hdev->mmu_enable)
return 0;
hdev->mmu_pgt_pool =
gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
if (!hdev->mmu_pgt_pool) {
dev_err(hdev->dev, "Failed to create page gen pool\n");
return -ENOMEM;
}
rc = gen_pool_add(hdev->mmu_pgt_pool, prop->mmu_pgt_addr +
prop->mmu_hop0_tables_total_size,
prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
-1);
if (rc) {
dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
goto err_pool_add;
}
hdev->mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
prop->mmu_hop_table_size,
GFP_KERNEL | __GFP_ZERO);
if (ZERO_OR_NULL_PTR(hdev->mmu_shadow_hop0)) {
rc = -ENOMEM;
goto err_pool_add;
}
/* MMU H/W init will be done in device hw_init() */
if (hdev->mmu_enable)
return hdev->mmu_func.init(hdev);
return 0;
err_pool_add:
gen_pool_destroy(hdev->mmu_pgt_pool);
return rc;
}
/**
@ -477,13 +48,8 @@ int hl_mmu_init(struct hl_device *hdev)
*/
void hl_mmu_fini(struct hl_device *hdev)
{
if (!hdev->mmu_enable)
return;
/* MMU H/W fini was already done in device hw_fini() */
kvfree(hdev->mmu_shadow_hop0);
gen_pool_destroy(hdev->mmu_pgt_pool);
if (hdev->mmu_enable)
hdev->mmu_func.fini(hdev);
}
/**
@ -498,13 +64,10 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (!hdev->mmu_enable)
return 0;
if (hdev->mmu_enable)
return hdev->mmu_func.ctx_init(ctx);
mutex_init(&ctx->mmu_lock);
hash_init(ctx->mmu_shadow_hash);
return dram_default_mapping_init(ctx);
return 0;
}
/*
@ -520,160 +83,9 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
void hl_mmu_ctx_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct pgt_info *pgt_info;
struct hlist_node *tmp;
int i;
if (!hdev->mmu_enable)
return;
dram_default_mapping_fini(ctx);
if (!hash_empty(ctx->mmu_shadow_hash))
dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
ctx->asid);
hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
dev_err_ratelimited(hdev->dev,
"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
_free_hop(ctx, pgt_info);
}
mutex_destroy(&ctx->mmu_lock);
}
static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte;
bool is_huge, clear_hop3 = true;
/* shifts and masks are the same in PMMU and HPMMU, use one of them */
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_next_hop_addr(ctx, curr_pte);
if (hop1_addr == ULLONG_MAX)
goto not_mapped;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_next_hop_addr(ctx, curr_pte);
if (hop2_addr == ULLONG_MAX)
goto not_mapped;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_next_hop_addr(ctx, curr_pte);
if (hop3_addr == ULLONG_MAX)
goto not_mapped;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
is_huge = curr_pte & LAST_MASK;
if (is_dram_addr && !is_huge) {
dev_err(hdev->dev,
"DRAM unmapping should use huge pages only\n");
return -EFAULT;
}
if (!is_huge) {
hop4_addr = get_next_hop_addr(ctx, curr_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
clear_hop3 = false;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte == default_pte) {
dev_err(hdev->dev,
"DRAM: hop3 PTE points to zero page, can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
if (!(curr_pte & PAGE_PRESENT_MASK)) {
dev_err(hdev->dev,
"DRAM: hop3 PTE is cleared! can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
write_final_pte(ctx, hop3_pte_addr, default_pte);
put_pte(ctx, hop3_addr);
} else {
if (!(curr_pte & PAGE_PRESENT_MASK))
goto not_mapped;
if (hop4_addr)
clear_pte(ctx, hop4_pte_addr);
else
clear_pte(ctx, hop3_pte_addr);
if (hop4_addr && !put_pte(ctx, hop4_addr))
clear_hop3 = true;
if (!clear_hop3)
goto mapped;
clear_pte(ctx, hop3_pte_addr);
if (put_pte(ctx, hop3_addr))
goto mapped;
clear_pte(ctx, hop2_pte_addr);
if (put_pte(ctx, hop2_addr))
goto mapped;
clear_pte(ctx, hop1_pte_addr);
if (put_pte(ctx, hop1_addr))
goto mapped;
clear_pte(ctx, hop0_pte_addr);
}
mapped:
return 0;
not_mapped:
dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
virt_addr);
return -EINVAL;
if (hdev->mmu_enable)
hdev->mmu_func.ctx_fini(ctx);
}
/*
@ -738,7 +150,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
real_virt_addr = virt_addr;
for (i = 0 ; i < npages ; i++) {
rc = _hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr);
rc = hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr);
if (rc)
break;
@ -746,172 +158,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
}
if (flush_pte)
flush(ctx);
return rc;
}
static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
u32 page_size, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte = 0;
bool hop1_new = false, hop2_new = false, hop3_new = false,
hop4_new = false, is_huge;
int rc = -ENOMEM;
/*
* This mapping function can map a page or a huge page. For huge page
* there are only 3 hops rather than 4. Currently the DRAM allocation
* uses huge pages only but user memory could have been allocated with
* one of the two page sizes. Since this is a common code for all the
* three cases, we need this hugs page check.
*/
if (is_dram_addr) {
mmu_prop = &prop->dmmu;
is_huge = true;
} else if (page_size == prop->pmmu_huge.page_size) {
mmu_prop = &prop->pmmu_huge;
is_huge = true;
} else {
mmu_prop = &prop->pmmu;
is_huge = false;
}
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
if (hop1_addr == ULLONG_MAX)
goto err;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
if (hop2_addr == ULLONG_MAX)
goto err;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
if (hop3_addr == ULLONG_MAX)
goto err;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
if (!is_huge) {
hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
if (hop4_addr == ULLONG_MAX)
goto err;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte != default_pte) {
dev_err(hdev->dev,
"DRAM: mapping already exists for virt_addr 0x%llx\n",
virt_addr);
rc = -EINVAL;
goto err;
}
if (hop1_new || hop2_new || hop3_new || hop4_new) {
dev_err(hdev->dev,
"DRAM mapping should not allocate more hops\n");
rc = -EFAULT;
goto err;
}
} else if (curr_pte & PAGE_PRESENT_MASK) {
dev_err(hdev->dev,
"mapping already exists for virt_addr 0x%llx\n",
virt_addr);
dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop0_pte_addr, hop0_pte_addr);
dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop1_pte_addr, hop1_pte_addr);
dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop2_pte_addr, hop2_pte_addr);
dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop3_pte_addr, hop3_pte_addr);
if (!is_huge)
dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop4_pte_addr,
hop4_pte_addr);
rc = -EINVAL;
goto err;
}
curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
| PAGE_PRESENT_MASK;
if (is_huge)
write_final_pte(ctx, hop3_pte_addr, curr_pte);
else
write_final_pte(ctx, hop4_pte_addr, curr_pte);
if (hop1_new) {
curr_pte =
(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_pte_addr, curr_pte);
}
if (hop2_new) {
curr_pte =
(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_pte_addr, curr_pte);
get_pte(ctx, hop1_addr);
}
if (hop3_new) {
curr_pte =
(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, curr_pte);
get_pte(ctx, hop2_addr);
}
if (!is_huge) {
if (hop4_new) {
curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop3_pte_addr, curr_pte);
get_pte(ctx, hop3_addr);
}
get_pte(ctx, hop4_addr);
} else {
get_pte(ctx, hop3_addr);
}
return 0;
err:
if (hop4_new)
free_hop(ctx, hop4_addr);
if (hop3_new)
free_hop(ctx, hop3_addr);
if (hop2_new)
free_hop(ctx, hop2_addr);
if (hop1_new)
free_hop(ctx, hop1_addr);
hdev->mmu_func.flush(ctx);
return rc;
}
@ -984,7 +231,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
real_phys_addr = phys_addr;
for (i = 0 ; i < npages ; i++) {
rc = _hl_mmu_map(ctx, real_virt_addr, real_phys_addr,
rc = hdev->mmu_func.map(ctx, real_virt_addr, real_phys_addr,
real_page_size, is_dram_addr);
if (rc)
goto err;
@ -995,21 +242,21 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
}
if (flush_pte)
flush(ctx);
hdev->mmu_func.flush(ctx);
return 0;
err:
real_virt_addr = virt_addr;
for (i = 0 ; i < mapped_cnt ; i++) {
if (_hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr))
if (hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr))
dev_warn_ratelimited(hdev->dev,
"failed to unmap va: 0x%llx\n", real_virt_addr);
real_virt_addr += real_page_size;
}
flush(ctx);
hdev->mmu_func.flush(ctx);
return rc;
}
@ -1022,7 +269,10 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
*/
void hl_mmu_swap_out(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (hdev->mmu_enable)
hdev->mmu_func.swap_out(ctx);
}
/*
@ -1033,5 +283,27 @@ void hl_mmu_swap_out(struct hl_ctx *ctx)
*/
void hl_mmu_swap_in(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (hdev->mmu_enable)
hdev->mmu_func.swap_in(ctx);
}
int hl_mmu_if_set_funcs(struct hl_device *hdev)
{
if (!hdev->mmu_enable)
return 0;
switch (hdev->asic_type) {
case ASIC_GOYA:
case ASIC_GAUDI:
hl_mmu_v1_set_funcs(hdev);
break;
default:
dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
hdev->asic_type);
return -EOPNOTSUPP;
}
return 0;
}
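hl_mmu_if_set_funcs() is the seam introduced by the device-oriented refactor: common code only ever dispatches through hdev->mmu_func, and the ASIC-specific implementation registers its callbacks once at init. A sketch of how a hypothetical second implementation would plug in; the member names are the ones dispatched through hdev->mmu_func above, while every hl_mmu_v2_* symbol is invented and not part of this patch set:

/* Hypothetical example only; hl_mmu_v2_* do not exist in this patch set. */
static void hl_mmu_v2_set_funcs(struct hl_device *hdev)
{
	hdev->mmu_func.init = hl_mmu_v2_init;
	hdev->mmu_func.fini = hl_mmu_v2_fini;
	hdev->mmu_func.ctx_init = hl_mmu_v2_ctx_init;
	hdev->mmu_func.ctx_fini = hl_mmu_v2_ctx_fini;
	hdev->mmu_func.map = hl_mmu_v2_map;
	hdev->mmu_func.unmap = hl_mmu_v2_unmap;
	hdev->mmu_func.flush = hl_mmu_v2_flush;
	hdev->mmu_func.swap_out = hl_mmu_v2_swap_out;
	hdev->mmu_func.swap_in = hl_mmu_v2_swap_in;
}

A new ASIC type would then get its own case in the switch inside hl_mmu_if_set_funcs(), calling this registration routine instead of hl_mmu_v1_set_funcs().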


@ -0,0 +1,863 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2016-2019 HabanaLabs, Ltd.
* All Rights Reserved.
*/
#include "habanalabs.h"
#include "../include/hw_ip/mmu/mmu_general.h"
#include <linux/genalloc.h>
#include <linux/slab.h>
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = NULL;
hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
(unsigned long) hop_addr)
if (hop_addr == pgt_info->shadow_addr)
break;
return pgt_info;
}
static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
{
struct hl_device *hdev = ctx->hdev;
gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, pgt_info->phys_addr,
hdev->asic_prop.mmu_hop_table_size);
hash_del(&pgt_info->node);
kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
kfree(pgt_info);
}
static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
_free_hop(ctx, pgt_info);
}
static u64 alloc_hop(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct pgt_info *pgt_info;
u64 phys_addr, shadow_addr;
pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
if (!pgt_info)
return ULLONG_MAX;
phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.mmu_pgt_pool,
prop->mmu_hop_table_size);
if (!phys_addr) {
dev_err(hdev->dev, "failed to allocate page\n");
goto pool_add_err;
}
shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
GFP_KERNEL);
if (!shadow_addr)
goto shadow_err;
pgt_info->phys_addr = phys_addr;
pgt_info->shadow_addr = shadow_addr;
pgt_info->ctx = ctx;
pgt_info->num_of_ptes = 0;
hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
return shadow_addr;
shadow_err:
gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, phys_addr,
prop->mmu_hop_table_size);
pool_add_err:
kfree(pgt_info);
return ULLONG_MAX;
}
static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
{
return ctx->hdev->asic_prop.mmu_pgt_addr +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static inline u64 get_hop0_addr(struct hl_ctx *ctx)
{
return (u64) (uintptr_t) ctx->hdev->mmu_priv.mmu_shadow_hop0 +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static void flush(struct hl_ctx *ctx)
{
/* flush all writes from all cores to reach PCI */
mb();
ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx));
}
/* transform the value to physical address when writing to H/W */
static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
{
/*
* The value to write is actually the address of the next shadow hop +
* flags at the 12 LSBs.
* Hence in order to get the value to write to the physical PTE, we
* clear the 12 LSBs and translate the shadow hop to its associated
* physical hop, and add back the original 12 LSBs.
*/
u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
(val & FLAGS_MASK);
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
phys_val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* do not transform the value to physical address when writing to H/W */
static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr,
u64 val)
{
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* clear the last and present bits */
static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr)
{
/* no need to transform the value to physical address */
write_final_pte(ctx, pte_addr, 0);
}
static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
{
get_pgt_info(ctx, hop_addr)->num_of_ptes++;
}
/*
* put_pte - decrement the num of ptes and free the hop if possible
*
* @ctx: pointer to the context structure
* @hop_addr: addr of the hop
*
* This function returns the number of ptes left on this hop. If the number is
* 0, it means the pte was freed.
*/
static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
int num_of_ptes_left;
pgt_info->num_of_ptes--;
/*
* Need to save the number of ptes left because free_hop might free
* the pgt_info
*/
num_of_ptes_left = pgt_info->num_of_ptes;
if (!num_of_ptes_left)
_free_hop(ctx, pgt_info);
return num_of_ptes_left;
}
static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr, u64 mask, u64 shift)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & mask) >> shift);
}
static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
mmu_prop->hop0_shift);
}
static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
mmu_prop->hop1_shift);
}
static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
mmu_prop->hop2_shift);
}
static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
mmu_prop->hop3_shift);
}
static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
mmu_prop->hop4_shift);
}
static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
return curr_pte & HOP_PHYS_ADDR_MASK;
else
return ULLONG_MAX;
}
static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
bool *is_new_hop)
{
u64 hop_addr = get_next_hop_addr(ctx, curr_pte);
if (hop_addr == ULLONG_MAX) {
hop_addr = alloc_hop(ctx);
*is_new_hop = (hop_addr != ULLONG_MAX);
}
return hop_addr;
}
/* translates shadow address inside hop to a physical address */
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
{
u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1);
u64 shadow_hop_addr = shadow_addr & ~page_mask;
u64 pte_offset = shadow_addr & page_mask;
u64 phys_hop_addr;
if (shadow_hop_addr != get_hop0_addr(ctx))
phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
else
phys_hop_addr = get_phys_hop0_addr(ctx);
return phys_hop_addr + pte_offset;
}
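/*
 * Worked example (all numbers invented): with mmu_hop_table_size = 0x1000,
 * consider a hop whose shadow copy was kzalloc'ed at 0xffff888012340000 and
 * whose pgt_info records phys_addr = 0x2000010000. For the shadow PTE
 * address 0xffff888012340218, page_mask = 0xfff gives pte_offset = 0x218,
 * so get_phys_addr() returns 0x2000010218. write_pte() then hands the
 * translated value to asic_funcs->write_pte() at that H/W address while
 * keeping the next hop's shadow address in the shadow copy, so later page
 * table walks stay entirely in host memory.
 */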
static int dram_default_mapping_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr, pte_val;
int rc, i, j, hop3_allocated = 0;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return 0;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL);
if (!ctx->dram_default_hops)
return -ENOMEM;
hop0_addr = get_hop0_addr(ctx);
hop1_addr = alloc_hop(ctx);
if (hop1_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 1\n");
rc = -ENOMEM;
goto hop1_err;
}
ctx->dram_default_hops[total_hops - 1] = hop1_addr;
hop2_addr = alloc_hop(ctx);
if (hop2_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 2\n");
rc = -ENOMEM;
goto hop2_err;
}
ctx->dram_default_hops[total_hops - 2] = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
ctx->dram_default_hops[i] = alloc_hop(ctx);
if (ctx->dram_default_hops[i] == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i);
rc = -ENOMEM;
goto hop3_err;
}
hop3_allocated++;
}
/* need only pte 0 in hops 0 and 1 */
pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_addr, pte_val);
pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_addr, pte_val);
get_pte(ctx, hop1_addr);
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, pte_val);
get_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
LAST_MASK | PAGE_PRESENT_MASK;
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
write_final_pte(ctx, hop3_pte_addr, pte_val);
get_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
flush(ctx);
return 0;
hop3_err:
for (i = 0 ; i < hop3_allocated ; i++)
free_hop(ctx, ctx->dram_default_hops[i]);
free_hop(ctx, hop2_addr);
hop2_err:
free_hop(ctx, hop1_addr);
hop1_err:
kfree(ctx->dram_default_hops);
return rc;
}
static void dram_default_mapping_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr;
int i, j;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
hop0_addr = get_hop0_addr(ctx);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
hop1_addr = ctx->dram_default_hops[total_hops - 1];
hop2_addr = ctx->dram_default_hops[total_hops - 2];
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
clear_pte(ctx, hop3_pte_addr);
put_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
hop2_pte_addr = hop2_addr;
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
clear_pte(ctx, hop2_pte_addr);
put_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
clear_pte(ctx, hop1_addr);
put_pte(ctx, hop1_addr);
clear_pte(ctx, hop0_addr);
kfree(ctx->dram_default_hops);
flush(ctx);
}
/**
* hl_mmu_v1_init() - initialize the MMU module.
* @hdev: habanalabs device structure.
*
* This function does the following:
* - Create a pool of pages for pgt_infos.
* - Create a shadow table for pgt
*
* Return: 0 for success, non-zero for failure.
*/
static int hl_mmu_v1_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
hdev->mmu_priv.mmu_pgt_pool =
gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
if (!hdev->mmu_priv.mmu_pgt_pool) {
dev_err(hdev->dev, "Failed to create page gen pool\n");
return -ENOMEM;
}
rc = gen_pool_add(hdev->mmu_priv.mmu_pgt_pool, prop->mmu_pgt_addr +
prop->mmu_hop0_tables_total_size,
prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
-1);
if (rc) {
dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
goto err_pool_add;
}
hdev->mmu_priv.mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
prop->mmu_hop_table_size,
GFP_KERNEL | __GFP_ZERO);
if (ZERO_OR_NULL_PTR(hdev->mmu_priv.mmu_shadow_hop0)) {
rc = -ENOMEM;
goto err_pool_add;
}
/* MMU H/W init will be done in device hw_init() */
return 0;
err_pool_add:
gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
return rc;
}
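The shadow array allocated above is sized max_asid * mmu_hop_table_size because every ASID owns one hop0 table inside it. A minimal sketch of how such a per-ASID slice can be located follows; it is illustrative only and the driver's actual helper may differ in detail.
/* Illustrative only: locate the shadow hop0 table of a given context. */
static u64 example_shadow_hop0_addr(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	/* Each ASID owns one hop-table-sized slot in mmu_shadow_hop0. */
	return (u64) (uintptr_t) hdev->mmu_priv.mmu_shadow_hop0 +
			ctx->asid * hdev->asic_prop.mmu_hop_table_size;
}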
/**
* hl_mmu_v1_fini() - release the MMU module.
* @hdev: habanalabs device structure.
*
* This function does the following:
* - Disable MMU in H/W.
* - Free the pgt_infos pool.
*
* All contexts should be freed before calling this function.
*/
static void hl_mmu_v1_fini(struct hl_device *hdev)
{
/* MMU H/W fini was already done in device hw_fini() */
kvfree(hdev->mmu_priv.mmu_shadow_hop0);
gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
}
/**
* hl_mmu_v1_ctx_init() - initialize a context for using the MMU module.
* @ctx: pointer to the context structure to initialize.
*
* Initialize a mutex to protect the concurrent mapping flow and a hash to hold
* all the page table hops related to this context.
* Return: 0 on success, non-zero otherwise.
*/
static int hl_mmu_v1_ctx_init(struct hl_ctx *ctx)
{
mutex_init(&ctx->mmu_lock);
hash_init(ctx->mmu_shadow_hash);
return dram_default_mapping_init(ctx);
}
/*
* hl_mmu_v1_ctx_fini - disable a ctx from using the MMU module
*
* @ctx: pointer to the context structure
*
* This function does the following:
* - Free any pgts which were not freed yet
* - Free the mutex
* - Free DRAM default page mapping hops
*/
static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct pgt_info *pgt_info;
struct hlist_node *tmp;
int i;
dram_default_mapping_fini(ctx);
if (!hash_empty(ctx->mmu_shadow_hash))
dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
ctx->asid);
hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
dev_err_ratelimited(hdev->dev,
"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
_free_hop(ctx, pgt_info);
}
mutex_destroy(&ctx->mmu_lock);
}
static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
u64 virt_addr, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte;
bool is_huge, clear_hop3 = true;
/* shifts and masks are the same in PMMU and HPMMU, use one of them */
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_next_hop_addr(ctx, curr_pte);
if (hop1_addr == ULLONG_MAX)
goto not_mapped;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_next_hop_addr(ctx, curr_pte);
if (hop2_addr == ULLONG_MAX)
goto not_mapped;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_next_hop_addr(ctx, curr_pte);
if (hop3_addr == ULLONG_MAX)
goto not_mapped;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
is_huge = curr_pte & LAST_MASK;
if (is_dram_addr && !is_huge) {
dev_err(hdev->dev,
"DRAM unmapping should use huge pages only\n");
return -EFAULT;
}
if (!is_huge) {
hop4_addr = get_next_hop_addr(ctx, curr_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
clear_hop3 = false;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte == default_pte) {
dev_err(hdev->dev,
"DRAM: hop3 PTE points to zero page, can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
if (!(curr_pte & PAGE_PRESENT_MASK)) {
dev_err(hdev->dev,
"DRAM: hop3 PTE is cleared! can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
write_final_pte(ctx, hop3_pte_addr, default_pte);
put_pte(ctx, hop3_addr);
} else {
if (!(curr_pte & PAGE_PRESENT_MASK))
goto not_mapped;
if (hop4_addr)
clear_pte(ctx, hop4_pte_addr);
else
clear_pte(ctx, hop3_pte_addr);
if (hop4_addr && !put_pte(ctx, hop4_addr))
clear_hop3 = true;
if (!clear_hop3)
goto mapped;
clear_pte(ctx, hop3_pte_addr);
if (put_pte(ctx, hop3_addr))
goto mapped;
clear_pte(ctx, hop2_pte_addr);
if (put_pte(ctx, hop2_addr))
goto mapped;
clear_pte(ctx, hop1_pte_addr);
if (put_pte(ctx, hop1_addr))
goto mapped;
clear_pte(ctx, hop0_pte_addr);
}
mapped:
return 0;
not_mapped:
dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
virt_addr);
return -EINVAL;
}
static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
u32 page_size, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte = 0;
bool hop1_new = false, hop2_new = false, hop3_new = false,
hop4_new = false, is_huge;
int rc = -ENOMEM;
/*
* This mapping function can map a regular page or a huge page. For a huge
* page there are only 3 hops rather than 4. Currently the DRAM allocation
* uses huge pages only, but user memory could have been allocated with
* either of the two page sizes. Since this is common code for all three
* cases, we need this huge page check.
*/
if (is_dram_addr) {
mmu_prop = &prop->dmmu;
is_huge = true;
} else if (page_size == prop->pmmu_huge.page_size) {
mmu_prop = &prop->pmmu_huge;
is_huge = true;
} else {
mmu_prop = &prop->pmmu;
is_huge = false;
}
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
if (hop1_addr == ULLONG_MAX)
goto err;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
if (hop2_addr == ULLONG_MAX)
goto err;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
if (hop3_addr == ULLONG_MAX)
goto err;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
if (!is_huge) {
hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
if (hop4_addr == ULLONG_MAX)
goto err;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte != default_pte) {
dev_err(hdev->dev,
"DRAM: mapping already exists for virt_addr 0x%llx\n",
virt_addr);
rc = -EINVAL;
goto err;
}
if (hop1_new || hop2_new || hop3_new || hop4_new) {
dev_err(hdev->dev,
"DRAM mapping should not allocate more hops\n");
rc = -EFAULT;
goto err;
}
} else if (curr_pte & PAGE_PRESENT_MASK) {
dev_err(hdev->dev,
"mapping already exists for virt_addr 0x%llx\n",
virt_addr);
dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop0_pte_addr, hop0_pte_addr);
dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop1_pte_addr, hop1_pte_addr);
dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop2_pte_addr, hop2_pte_addr);
dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop3_pte_addr, hop3_pte_addr);
if (!is_huge)
dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop4_pte_addr,
hop4_pte_addr);
rc = -EINVAL;
goto err;
}
curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
| PAGE_PRESENT_MASK;
if (is_huge)
write_final_pte(ctx, hop3_pte_addr, curr_pte);
else
write_final_pte(ctx, hop4_pte_addr, curr_pte);
if (hop1_new) {
curr_pte =
(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_pte_addr, curr_pte);
}
if (hop2_new) {
curr_pte =
(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_pte_addr, curr_pte);
get_pte(ctx, hop1_addr);
}
if (hop3_new) {
curr_pte =
(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, curr_pte);
get_pte(ctx, hop2_addr);
}
if (!is_huge) {
if (hop4_new) {
curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop3_pte_addr, curr_pte);
get_pte(ctx, hop3_addr);
}
get_pte(ctx, hop4_addr);
} else {
get_pte(ctx, hop3_addr);
}
return 0;
err:
if (hop4_new)
free_hop(ctx, hop4_addr);
if (hop3_new)
free_hop(ctx, hop3_addr);
if (hop2_new)
free_hop(ctx, hop2_addr);
if (hop1_new)
free_hop(ctx, hop1_addr);
return rc;
}
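The get_hop0_pte_addr()..get_hop4_pte_addr() helpers used by the map and unmap walks above all follow the same pattern: pick the hop's index bits out of the virtual address and turn them into a byte offset inside the hop table. A simplified sketch, where hop_mask and hop_shift stand in for the per-hop fields of struct hl_mmu_properties (the names here are illustrative, not the driver's exact ones):
/* Illustrative only: locate the PTE of one hop for a given virtual address. */
static u64 example_hop_pte_addr(u64 hop_table_addr, u64 virt_addr,
				u64 hop_mask, u32 hop_shift)
{
	/* Select this hop's index bits from the virtual address... */
	u64 pte_idx = (virt_addr & hop_mask) >> hop_shift;

	/* ...and turn the index into a byte offset inside the hop table. */
	return hop_table_addr + pte_idx * HL_PTE_SIZE;
}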
/*
* hl_mmu_v1_swap_out - marks all mappings of the given ctx as swapped out
*
* @ctx: pointer to the context structure
*
*/
static void hl_mmu_v1_swap_out(struct hl_ctx *ctx)
{
}
/*
* hl_mmu_v1_swap_in - marks all mappings of the given ctx as swapped in
*
* @ctx: pointer to the context structure
*
*/
static void hl_mmu_v1_swap_in(struct hl_ctx *ctx)
{
}
/*
* hl_mmu_v1_set_funcs - set the MMU function pointers to the MMU v1 implementation
*
* @hdev: pointer to the device structure
*/
void hl_mmu_v1_set_funcs(struct hl_device *hdev)
{
struct hl_mmu_funcs *mmu = &hdev->mmu_func;
mmu->init = hl_mmu_v1_init;
mmu->fini = hl_mmu_v1_fini;
mmu->ctx_init = hl_mmu_v1_ctx_init;
mmu->ctx_fini = hl_mmu_v1_ctx_fini;
mmu->map = _hl_mmu_v1_map;
mmu->unmap = _hl_mmu_v1_unmap;
mmu->flush = flush;
mmu->swap_out = hl_mmu_v1_swap_out;
mmu->swap_in = hl_mmu_v1_swap_in;
}
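Once hl_mmu_v1_set_funcs() has populated the table, the generic MMU code reaches the v1 implementation only through hdev->mmu_func. A minimal sketch of that dispatch; the wrapper name here is illustrative:
/* Illustrative only: callers dispatch through the per-device function table. */
static int example_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
			   u32 page_size, bool is_dram_addr)
{
	struct hl_device *hdev = ctx->hdev;

	/* Whichever MMU version was installed at init time handles the map. */
	return hdev->mmu_func.map(ctx, virt_addr, phys_addr, page_size,
				  is_dram_addr);
}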

View File

@ -9,7 +9,6 @@
#include "../include/hw_ip/pci/pci_general.h"
#include <linux/pci.h>
#include <linux/bitfield.h>
#define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 10)
@ -339,12 +338,17 @@ static int hl_pci_set_dma_mask(struct hl_device *hdev)
/**
* hl_pci_init() - PCI initialization code.
* @hdev: Pointer to hl_device structure.
* @cpu_boot_status_reg: status register of the device's CPU
* @boot_err0_reg: boot error register of the device's CPU
* @preboot_ver_timeout: how long to wait before bailing out on reading
* the preboot version
*
* Set DMA masks, initialize the PCI controller and map the PCI BARs.
*
* Return: 0 on success, non-zero for failure.
*/
int hl_pci_init(struct hl_device *hdev)
int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 preboot_ver_timeout)
{
struct pci_dev *pdev = hdev->pdev;
int rc;
@ -376,6 +380,15 @@ int hl_pci_init(struct hl_device *hdev)
if (rc)
goto unmap_pci_bars;
/* Before continuing with the initialization, we need to read the preboot
* version to determine whether we run with security-enabled firmware.
* The check itself is done in each ASIC's specific code.
*/
rc = hl_fw_read_preboot_ver(hdev, cpu_boot_status_reg, boot_err0_reg,
preboot_ver_timeout);
if (rc)
goto unmap_pci_bars;
return 0;
unmap_pci_bars:

View File

@ -11,18 +11,18 @@
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
if (curr)
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_CURR_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_CURR_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
else
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.pll_index = cpu_to_le32(pll_index);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -40,13 +40,13 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.pll_index = cpu_to_le32(pll_index);
pkt.value = cpu_to_le64(freq);
@ -61,14 +61,14 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
u64 hl_get_max_power(struct hl_device *hdev)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, &result);
@ -83,13 +83,13 @@ u64 hl_get_max_power(struct hl_device *hdev)
void hl_set_max_power(struct hl_device *hdev)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(hdev->max_power);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -112,7 +112,7 @@ static ssize_t armcp_kernel_ver_show(struct device *dev,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.armcp_info.kernel_version);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
}
static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
@ -120,7 +120,7 @@ static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.armcp_version);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
}
static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
@ -129,7 +129,23 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "0x%08x\n",
hdev->asic_prop.armcp_info.cpld_version);
hdev->asic_prop.cpucp_info.cpld_version);
}
static ssize_t cpucp_kernel_ver_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
}
static ssize_t cpucp_ver_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
}
static ssize_t infineon_ver_show(struct device *dev,
@ -138,7 +154,7 @@ static ssize_t infineon_ver_show(struct device *dev,
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "0x%04x\n",
hdev->asic_prop.armcp_info.infineon_version);
hdev->asic_prop.cpucp_info.infineon_version);
}
static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
@ -146,7 +162,7 @@ static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.fuse_version);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.fuse_version);
}
static ssize_t thermal_ver_show(struct device *dev,
@ -154,7 +170,7 @@ static ssize_t thermal_ver_show(struct device *dev,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.armcp_info.thermal_version);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.thermal_version);
}
static ssize_t preboot_btl_ver_show(struct device *dev,
@ -356,6 +372,8 @@ static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj,
static DEVICE_ATTR_RO(armcp_kernel_ver);
static DEVICE_ATTR_RO(armcp_ver);
static DEVICE_ATTR_RO(cpld_ver);
static DEVICE_ATTR_RO(cpucp_kernel_ver);
static DEVICE_ATTR_RO(cpucp_ver);
static DEVICE_ATTR_RO(device_type);
static DEVICE_ATTR_RO(fuse_ver);
static DEVICE_ATTR_WO(hard_reset);
@ -380,6 +398,8 @@ static struct attribute *hl_dev_attrs[] = {
&dev_attr_armcp_kernel_ver.attr,
&dev_attr_armcp_ver.attr,
&dev_attr_cpld_ver.attr,
&dev_attr_cpucp_kernel_ver.attr,
&dev_attr_cpucp_ver.attr,
&dev_attr_device_type.attr,
&dev_attr_fuse_ver.attr,
&dev_attr_hard_reset.attr,

View File

@ -21,7 +21,6 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/iommu.h>
#include <linux/seq_file.h>
#include <linux/bitfield.h>
/*
* Gaudi security scheme:
@ -360,13 +359,14 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
u32 tpc_id);
static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev);
static int gaudi_armcp_info_get(struct hl_device *hdev);
static int gaudi_cpucp_info_get(struct hl_device *hdev);
static void gaudi_disable_clock_gating(struct hl_device *hdev);
static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid);
static int gaudi_get_fixed_properties(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u32 num_sync_stream_queues = 0;
int i;
prop->max_queues = GAUDI_QUEUE_ID_SIZE;
@ -383,6 +383,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 1;
prop->hw_queues_props[i].supports_sync_stream = 1;
num_sync_stream_queues++;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) {
prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
prop->hw_queues_props[i].driver_only = 1;
@ -440,6 +441,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->pmmu.end_addr =
(VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1;
prop->pmmu.page_size = PAGE_SIZE_4KB;
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@ -464,11 +466,16 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
strncpy(prop->cpucp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
prop->first_available_user_sob[HL_GAUDI_WS_DCORE] =
num_sync_stream_queues * HL_RSVD_SOBS;
prop->first_available_user_mon[HL_GAUDI_WS_DCORE] =
num_sync_stream_queues * HL_RSVD_MONS;
return 0;
}
@ -592,10 +599,15 @@ static int gaudi_early_init(struct hl_device *hdev)
prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
rc = hl_pci_init(hdev);
rc = hl_pci_init(hdev, mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS,
mmCPU_BOOT_ERR0, GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
if (rc)
goto free_queue_props;
/* GAUDI Firmware does not yet support security */
prop->fw_security_disabled = true;
dev_info(hdev->dev, "firmware-level security is disabled\n");
return 0;
free_queue_props:
@ -675,10 +687,10 @@ static int _gaudi_init_tpc_mem(struct hl_device *hdev,
init_tpc_mem_pkt->tsize = cpu_to_le32(tpc_kernel_size);
ctl = ((PACKET_LIN_DMA << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_LIN_DMA_CTL_LIN_SHIFT) |
(1 << GAUDI_PKT_CTL_RB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT));
ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA);
ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_LIN_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
init_tpc_mem_pkt->ctl = cpu_to_le32(ctl);
@ -780,13 +792,13 @@ static int gaudi_late_init(struct hl_device *hdev)
struct gaudi_device *gaudi = hdev->asic_specific;
int rc;
rc = gaudi->armcp_info_get(hdev);
rc = gaudi->cpucp_info_get(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to get armcp info\n");
dev_err(hdev->dev, "Failed to get cpucp info\n");
return rc;
}
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS);
if (rc) {
dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
return rc;
@ -811,7 +823,7 @@ static int gaudi_late_init(struct hl_device *hdev)
return 0;
disable_pci_access:
hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS);
return rc;
}
@ -981,7 +993,7 @@ static int gaudi_sw_init(struct hl_device *hdev)
}
}
gaudi->armcp_info_get = gaudi_armcp_info_get;
gaudi->cpucp_info_get = gaudi_cpucp_info_get;
gaudi->max_freq_value = GAUDI_MAX_CLK_FREQ;
@ -1911,6 +1923,9 @@ static void gaudi_init_dma_core(struct hl_device *hdev, int dma_id)
WREG32(mmDMA0_CORE_RD_MAX_OUTSTAND + dma_offset, 0);
WREG32(mmDMA0_CORE_RD_MAX_SIZE + dma_offset, 0);
/* WA for H/W bug H3-2116 */
WREG32(mmDMA0_CORE_LBW_MAX_OUTSTAND + dma_offset, 15);
/* STOP_ON bit implies no completion to operation in case of RAZWI */
if (hdev->stop_on_err)
dma_err_cfg |= 1 << DMA0_CORE_ERR_CFG_STOP_ON_ERR_SHIFT;
@ -2321,7 +2336,8 @@ static void gaudi_init_tpc_qmans(struct hl_device *hdev)
tpc_offset += mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0;
gaudi->hw_cap_initialized |= 1 << (HW_CAP_TPC_SHIFT + tpc_id);
gaudi->hw_cap_initialized |=
FIELD_PREP(HW_CAP_TPC_MASK, 1 << tpc_id);
}
}
@ -2847,7 +2863,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
if (err) {
dev_err(hdev->dev,
"Failed to communicate with ARM CPU (ArmCP timeout)\n");
"Failed to communicate with Device CPU (CPU-CP timeout)\n");
return -EIO;
}
@ -2860,6 +2876,18 @@ static void gaudi_pre_hw_init(struct hl_device *hdev)
/* Perform read from the device to make sure device is up */
RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);
/* Set the access through PCI bars (Linux driver only) as
* secured
*/
WREG32(mmPCIE_WRAP_LBW_PROT_OVR,
(PCIE_WRAP_LBW_PROT_OVR_RD_EN_MASK |
PCIE_WRAP_LBW_PROT_OVR_WR_EN_MASK));
/* Perform read to flush the waiting writes to ensure
* configuration was set in the device
*/
RREG32(mmPCIE_WRAP_LBW_PROT_OVR);
/*
* Let's mark in the H/W that we have reached this point. We check
* this value in the reset_before_init function to understand whether
@ -2868,31 +2896,6 @@ static void gaudi_pre_hw_init(struct hl_device *hdev)
*/
WREG32(mmHW_STATE, HL_DEVICE_HW_STATE_DIRTY);
/* Set the access through PCI bars (Linux driver only) as secured */
WREG32(mmPCIE_WRAP_LBW_PROT_OVR, (PCIE_WRAP_LBW_PROT_OVR_RD_EN_MASK |
PCIE_WRAP_LBW_PROT_OVR_WR_EN_MASK));
/* Perform read to flush the waiting writes to ensure configuration
* was set in the device
*/
RREG32(mmPCIE_WRAP_LBW_PROT_OVR);
if (hdev->axi_drain) {
WREG32(mmPCIE_WRAP_LBW_DRAIN_CFG,
1 << PCIE_WRAP_LBW_DRAIN_CFG_EN_SHIFT);
WREG32(mmPCIE_WRAP_HBW_DRAIN_CFG,
1 << PCIE_WRAP_HBW_DRAIN_CFG_EN_SHIFT);
/* Perform read to flush the DRAIN cfg */
RREG32(mmPCIE_WRAP_HBW_DRAIN_CFG);
} else {
WREG32(mmPCIE_WRAP_LBW_DRAIN_CFG, 0);
WREG32(mmPCIE_WRAP_HBW_DRAIN_CFG, 0);
/* Perform read to flush the DRAIN cfg */
RREG32(mmPCIE_WRAP_HBW_DRAIN_CFG);
}
/* Configure the reset registers. Must be done as early as possible
* in case we fail during H/W initialization
*/
@ -2900,13 +2903,13 @@ static void gaudi_pre_hw_init(struct hl_device *hdev)
(CFG_RST_H_DMA_MASK |
CFG_RST_H_MME_MASK |
CFG_RST_H_SM_MASK |
CFG_RST_H_TPC_MASK));
CFG_RST_H_TPC_7_MASK));
WREG32(mmPSOC_GLOBAL_CONF_SOFT_RST_CFG_L, CFG_RST_L_TPC_MASK);
WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST_CFG_H,
(CFG_RST_H_HBM_MASK |
CFG_RST_H_TPC_MASK |
CFG_RST_H_TPC_7_MASK |
CFG_RST_H_NIC_MASK |
CFG_RST_H_SM_MASK |
CFG_RST_H_DMA_MASK |
@ -3071,7 +3074,7 @@ static int gaudi_suspend(struct hl_device *hdev)
{
int rc;
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS);
if (rc)
dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
@ -3084,17 +3087,16 @@ static int gaudi_resume(struct hl_device *hdev)
}
static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
u64 kaddress, phys_addr_t paddress, u32 size)
void *cpu_addr, dma_addr_t dma_addr, size_t size)
{
int rc;
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |
VM_DONTCOPY | VM_NORESERVE;
rc = remap_pfn_range(vma, vma->vm_start, paddress >> PAGE_SHIFT,
size, vma->vm_page_prot);
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size);
if (rc)
dev_err(hdev->dev, "remap_pfn_range error %d", rc);
dev_err(hdev->dev, "dma_mmap_coherent error %d", rc);
return rc;
}
@ -3441,7 +3443,8 @@ static int gaudi_test_queue(struct hl_device *hdev, u32 hw_queue_id)
&fence_dma_addr);
if (!fence_ptr) {
dev_err(hdev->dev,
"Failed to allocate memory for queue testing\n");
"Failed to allocate memory for H/W queue %d testing\n",
hw_queue_id);
return -ENOMEM;
}
@ -3452,14 +3455,16 @@ static int gaudi_test_queue(struct hl_device *hdev, u32 hw_queue_id)
GFP_KERNEL, &pkt_dma_addr);
if (!fence_pkt) {
dev_err(hdev->dev,
"Failed to allocate packet for queue testing\n");
"Failed to allocate packet for H/W queue %d testing\n",
hw_queue_id);
rc = -ENOMEM;
goto free_fence_ptr;
}
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_EB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(fence_val);
fence_pkt->addr = cpu_to_le64(fence_dma_addr);
@ -3469,7 +3474,8 @@ static int gaudi_test_queue(struct hl_device *hdev, u32 hw_queue_id)
pkt_dma_addr);
if (rc) {
dev_err(hdev->dev,
"Failed to send fence packet\n");
"Failed to send fence packet to H/W queue %d\n",
hw_queue_id);
goto free_pkt;
}
@ -3959,8 +3965,6 @@ static int gaudi_patch_dma_packet(struct hl_device *hdev,
}
}
new_dma_pkt->ctl = user_dma_pkt->ctl;
ctl = le32_to_cpu(user_dma_pkt->ctl);
if (likely(dma_desc_cnt))
ctl &= ~GAUDI_PKT_CTL_EB_MASK;
@ -4105,8 +4109,9 @@ static int gaudi_parse_cb_mmu(struct hl_device *hdev,
parser->patched_cb_size = parser->user_cb_size +
sizeof(struct packet_msg_prot) * 2;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
@ -4178,8 +4183,9 @@ static int gaudi_parse_cb_no_mmu(struct hl_device *hdev,
if (rc)
goto free_userptr;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate patched CB for DMA CS %d\n", rc);
@ -4275,11 +4281,11 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev,
cq_pkt = (struct packet_msg_prot *) (uintptr_t)
(kernel_address + len - (sizeof(struct packet_msg_prot) * 2));
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
if (eb)
tmp |= (1 << GAUDI_PKT_CTL_EB_SHIFT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
cq_pkt->ctl = cpu_to_le32(tmp);
cq_pkt->value = cpu_to_le32(cq_val);
@ -4287,8 +4293,8 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev,
cq_pkt++;
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
cq_pkt->ctl = cpu_to_le32(tmp);
cq_pkt->value = cpu_to_le32(1);
@ -4320,11 +4326,12 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
memset(lin_dma_pkt, 0, sizeof(*lin_dma_pkt));
cb_size = sizeof(*lin_dma_pkt);
ctl = ((PACKET_LIN_DMA << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_LIN_DMA_CTL_MEMSET_SHIFT) |
(1 << GAUDI_PKT_LIN_DMA_CTL_LIN_SHIFT) |
(1 << GAUDI_PKT_CTL_RB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT));
ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA);
ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_MEMSET_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_LIN_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
lin_dma_pkt->ctl = cpu_to_le32(ctl);
lin_dma_pkt->src_addr = cpu_to_le64(val);
lin_dma_pkt->dst_addr |= cpu_to_le64(addr);
@ -4930,9 +4937,10 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address +
job->job_cb_size - sizeof(struct packet_msg_prot));
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_EB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(GAUDI_QMAN0_FENCE_VAL);
fence_pkt->addr = cpu_to_le64(fence_dma_addr);
@ -5606,7 +5614,7 @@ static bool gaudi_tpc_read_interrupts(struct hl_device *hdev, u8 tpc_id,
bool soft_reset_required = false;
/* Accessing the TPC_INTR_CAUSE registers requires disabling the clock
* gating, and thus cannot be done in ArmCP and should be done instead
* gating, and thus cannot be done in CPU-CP and should be done instead
* by the driver.
*/
@ -5653,21 +5661,25 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
{
switch (event_type) {
case GAUDI_EVENT_FIX_POWER_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to power consumption\n");
break;
case GAUDI_EVENT_FIX_POWER_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Power envelop is safe, back to optimal clock\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to overheating\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Thermal envelop is safe, back to optimal clock\n");
break;
@ -6038,7 +6050,7 @@ static int gaudi_send_heartbeat(struct hl_device *hdev)
return hl_fw_send_heartbeat(hdev);
}
static int gaudi_armcp_info_get(struct hl_device *hdev)
static int gaudi_cpucp_info_get(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
struct asic_fixed_properties *prop = &hdev->asic_prop;
@ -6047,19 +6059,19 @@ static int gaudi_armcp_info_get(struct hl_device *hdev)
if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q))
return 0;
rc = hl_fw_armcp_info_get(hdev);
rc = hl_fw_cpucp_info_get(hdev);
if (rc)
return rc;
if (!strlen(prop->armcp_info.card_name))
strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
if (!strlen(prop->cpucp_info.card_name))
strncpy(prop->cpucp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
hdev->card_type = le32_to_cpu(hdev->asic_prop.armcp_info.card_type);
hdev->card_type = le32_to_cpu(hdev->asic_prop.cpucp_info.card_type);
if (hdev->card_type == armcp_card_type_pci)
if (hdev->card_type == cpucp_card_type_pci)
prop->max_power_default = MAX_POWER_DEFAULT_PCI;
else if (hdev->card_type == armcp_card_type_pmc)
else if (hdev->card_type == cpucp_card_type_pmc)
prop->max_power_default = MAX_POWER_DEFAULT_PMC;
hdev->max_power = prop->max_power_default;
@ -6067,7 +6079,7 @@ static int gaudi_armcp_info_get(struct hl_device *hdev)
return 0;
}
static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
struct seq_file *s)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@ -6099,7 +6111,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle <<
*mask |= ((u64) !is_eng_idle) <<
(GAUDI_ENGINE_ID_DMA_0 + dma_id);
if (s)
seq_printf(s, fmt, dma_id,
@ -6122,7 +6134,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GAUDI_ENGINE_ID_TPC_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GAUDI_ENGINE_ID_TPC_0 + i);
if (s)
seq_printf(s, fmt, i,
is_eng_idle ? "Y" : "N",
@ -6150,7 +6163,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GAUDI_ENGINE_ID_MME_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GAUDI_ENGINE_ID_MME_0 + i);
if (s) {
if (!is_slave)
seq_printf(s, fmt, i,
@ -6288,6 +6302,15 @@ static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
1000,
kernel_timeout);
if (rc) {
dev_err(hdev->dev,
"Timeout while waiting for TPC%d vector pipe\n",
tpc_id);
hdev->asic_funcs->set_clock_gating(hdev);
mutex_unlock(&gaudi->clk_gate_mutex);
return -EIO;
}
rc = hl_poll_timeout(
hdev,
mmTPC0_CFG_WQ_INFLIGHT_CNTR + offset,
@ -6617,7 +6640,6 @@ static const struct hl_asic_funcs gaudi_funcs = {
.send_cpu_message = gaudi_send_cpu_message,
.get_hw_state = gaudi_get_hw_state,
.pci_bars_map = gaudi_pci_bars_map,
.set_dram_bar_base = gaudi_set_hbm_bar_base,
.init_iatu = gaudi_init_iatu,
.rreg = hl_rreg,
.wreg = hl_wreg,

View File

@ -35,8 +35,6 @@
#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
#endif
#define QMAN_FENCE_TIMEOUT_USEC 10000 /* 10 ms */
#define CORESIGHT_TIMEOUT_USEC 100000 /* 100 ms */
#define GAUDI_MAX_CLK_FREQ 2200000000ull /* 2200 MHz */
@ -44,7 +42,7 @@
#define MAX_POWER_DEFAULT_PCI 200000 /* 200W */
#define MAX_POWER_DEFAULT_PMC 350000 /* 350W */
#define GAUDI_CPU_TIMEOUT_USEC 15000000 /* 15s */
#define GAUDI_CPU_TIMEOUT_USEC 30000000 /* 30s */
#define TPC_ENABLED_MASK 0xFF
@ -142,28 +140,28 @@
#define VA_HOST_SPACE_SIZE (VA_HOST_SPACE_END - \
VA_HOST_SPACE_START) /* 767TB */
#define HW_CAP_PLL 0x00000001
#define HW_CAP_HBM 0x00000002
#define HW_CAP_MMU 0x00000004
#define HW_CAP_MME 0x00000008
#define HW_CAP_CPU 0x00000010
#define HW_CAP_PCI_DMA 0x00000020
#define HW_CAP_MSI 0x00000040
#define HW_CAP_CPU_Q 0x00000080
#define HW_CAP_HBM_DMA 0x00000100
#define HW_CAP_CLK_GATE 0x00000200
#define HW_CAP_SRAM_SCRAMBLER 0x00000400
#define HW_CAP_HBM_SCRAMBLER 0x00000800
#define HW_CAP_PLL BIT(0)
#define HW_CAP_HBM BIT(1)
#define HW_CAP_MMU BIT(2)
#define HW_CAP_MME BIT(3)
#define HW_CAP_CPU BIT(4)
#define HW_CAP_PCI_DMA BIT(5)
#define HW_CAP_MSI BIT(6)
#define HW_CAP_CPU_Q BIT(7)
#define HW_CAP_HBM_DMA BIT(8)
#define HW_CAP_CLK_GATE BIT(9)
#define HW_CAP_SRAM_SCRAMBLER BIT(10)
#define HW_CAP_HBM_SCRAMBLER BIT(11)
#define HW_CAP_TPC0 0x01000000
#define HW_CAP_TPC1 0x02000000
#define HW_CAP_TPC2 0x04000000
#define HW_CAP_TPC3 0x08000000
#define HW_CAP_TPC4 0x10000000
#define HW_CAP_TPC5 0x20000000
#define HW_CAP_TPC6 0x40000000
#define HW_CAP_TPC7 0x80000000
#define HW_CAP_TPC_MASK 0xFF000000
#define HW_CAP_TPC0 BIT(24)
#define HW_CAP_TPC1 BIT(25)
#define HW_CAP_TPC2 BIT(26)
#define HW_CAP_TPC3 BIT(27)
#define HW_CAP_TPC4 BIT(28)
#define HW_CAP_TPC5 BIT(29)
#define HW_CAP_TPC6 BIT(30)
#define HW_CAP_TPC7 BIT(31)
#define HW_CAP_TPC_MASK GENMASK(31, 24)
#define HW_CAP_TPC_SHIFT 24
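As a sanity check on the BIT()/GENMASK() conversion above, the new macros expand to the old literal values, and the FIELD_PREP() call site shown earlier in gaudi.c sets the same bit as the previous open-coded shift:
/*
 * Illustration only:
 *   BIT(24)         == 0x01000000   (HW_CAP_TPC0)
 *   GENMASK(31, 24) == 0xFF000000   (HW_CAP_TPC_MASK)
 *
 *   gaudi->hw_cap_initialized |= FIELD_PREP(HW_CAP_TPC_MASK, 1 << tpc_id);
 *
 * places bit (24 + tpc_id), exactly like the old
 * "1 << (HW_CAP_TPC_SHIFT + tpc_id)" expression it replaces.
 */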
#define GAUDI_CPU_PCI_MSB_ADDR(addr) (((addr) & GENMASK_ULL(49, 39)) >> 39)
@ -216,7 +214,7 @@ struct gaudi_internal_qman_info {
/**
* struct gaudi_device - ASIC specific manage structure.
* @armcp_info_get: get information on device from ArmCP
* @cpucp_info_get: get information on device from CPU-CP
* @hw_queues_lock: protects the H/W queues from concurrent access.
* @clk_gate_mutex: protects code areas that require clock gating to be disabled
* temporarily
@ -239,7 +237,7 @@ struct gaudi_internal_qman_info {
* 8-bit value so use u8.
*/
struct gaudi_device {
int (*armcp_info_get)(struct hl_device *hdev);
int (*cpucp_info_get)(struct hl_device *hdev);
/* TODO: remove hw_queues_lock after moving to scheduler code */
spinlock_t hw_queues_lock;

File diff suppressed because it is too large

View File

@ -426,12 +426,14 @@ int goya_get_fixed_properties(struct hl_device *hdev)
prop->dmmu.start_addr = VA_DDR_SPACE_START;
prop->dmmu.end_addr = VA_DDR_SPACE_END;
prop->dmmu.page_size = PAGE_SIZE_2MB;
prop->dmmu.num_hops = MMU_ARCH_5_HOPS;
/* shifts and masks are the same in PMMU and DMMU */
memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
prop->pmmu.start_addr = VA_HOST_SPACE_START;
prop->pmmu.end_addr = VA_HOST_SPACE_END;
prop->pmmu.page_size = PAGE_SIZE_4KB;
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@ -449,7 +451,7 @@ int goya_get_fixed_properties(struct hl_device *hdev)
prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
strncpy(prop->cpucp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
prop->max_pending_cs = GOYA_MAX_PENDING_CS;
@ -598,10 +600,15 @@ static int goya_early_init(struct hl_device *hdev)
prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
rc = hl_pci_init(hdev);
rc = hl_pci_init(hdev, mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS,
mmCPU_BOOT_ERR0, GOYA_BOOT_FIT_REQ_TIMEOUT_USEC);
if (rc)
goto free_queue_props;
/* Goya Firmware does not support security */
prop->fw_security_disabled = true;
dev_info(hdev->dev, "firmware-level security is disabled\n");
if (!hdev->pldm) {
val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
if (val & PSOC_GLOBAL_CONF_BOOT_STRAP_PINS_SRIOV_EN_MASK)
@ -727,9 +734,9 @@ int goya_late_init(struct hl_device *hdev)
if (rc)
return rc;
rc = goya_armcp_info_get(hdev);
rc = goya_cpucp_info_get(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to get armcp info %d\n", rc);
dev_err(hdev->dev, "Failed to get cpucp info %d\n", rc);
return rc;
}
@ -739,7 +746,7 @@ int goya_late_init(struct hl_device *hdev)
*/
WREG32(mmMMU_LOG2_DDR_SIZE, ilog2(prop->dram_size));
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS);
if (rc) {
dev_err(hdev->dev,
"Failed to enable PCI access from CPU %d\n", rc);
@ -2648,7 +2655,7 @@ int goya_suspend(struct hl_device *hdev)
{
int rc;
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS);
if (rc)
dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
@ -2661,17 +2668,16 @@ int goya_resume(struct hl_device *hdev)
}
static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
u64 kaddress, phys_addr_t paddress, u32 size)
void *cpu_addr, dma_addr_t dma_addr, size_t size)
{
int rc;
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |
VM_DONTCOPY | VM_NORESERVE;
rc = remap_pfn_range(vma, vma->vm_start, paddress >> PAGE_SHIFT,
size, vma->vm_page_prot);
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size);
if (rc)
dev_err(hdev->dev, "remap_pfn_range error %d", rc);
dev_err(hdev->dev, "dma_mmap_coherent error %d", rc);
return rc;
}
@ -2946,7 +2952,8 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
&fence_dma_addr);
if (!fence_ptr) {
dev_err(hdev->dev,
"Failed to allocate memory for queue testing\n");
"Failed to allocate memory for H/W queue %d testing\n",
hw_queue_id);
return -ENOMEM;
}
@ -2957,7 +2964,8 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
GFP_KERNEL, &pkt_dma_addr);
if (!fence_pkt) {
dev_err(hdev->dev,
"Failed to allocate packet for queue testing\n");
"Failed to allocate packet for H/W queue %d testing\n",
hw_queue_id);
rc = -ENOMEM;
goto free_fence_ptr;
}
@ -2974,7 +2982,8 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
pkt_dma_addr);
if (rc) {
dev_err(hdev->dev,
"Failed to send fence packet\n");
"Failed to send fence packet to H/W queue %d\n",
hw_queue_id);
goto free_pkt;
}
@ -3806,8 +3815,9 @@ static int goya_parse_cb_mmu(struct hl_device *hdev,
parser->patched_cb_size = parser->user_cb_size +
sizeof(struct packet_msg_prot) * 2;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
@ -3879,8 +3889,9 @@ static int goya_parse_cb_no_mmu(struct hl_device *hdev,
if (rc)
goto free_userptr;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate patched CB for DMA CS %d\n", rc);
@ -4497,17 +4508,17 @@ static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
size_t irq_arr_size)
{
struct armcp_unmask_irq_arr_packet *pkt;
struct cpucp_unmask_irq_arr_packet *pkt;
size_t total_pkt_size;
long result;
int rc;
int irq_num_entries, irq_arr_index;
__le32 *goya_irq_arr;
total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) +
irq_arr_size;
/* data should be aligned to 8 bytes in order to ArmCP to copy it */
/* data should be aligned to 8 bytes in order for CPU-CP to copy it */
total_pkt_size = (total_pkt_size + 0x7) & ~0x7;
/* total_pkt_size is casted to u16 later on */
@ -4531,8 +4542,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
goya_irq_arr[irq_arr_index] =
cpu_to_le32(irq_arr[irq_arr_index]);
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
total_pkt_size, 0, &result);
@ -4557,14 +4568,14 @@ static int goya_soft_reset_late_init(struct hl_device *hdev)
static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(event_type);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -4580,18 +4591,22 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
{
switch (event_type) {
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to power consumption\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Power envelop is safe, back to optimal clock\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to overheating\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Thermal envelop is safe, back to optimal clock\n");
break;
@ -4638,7 +4653,8 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
case GOYA_ASYNC_EVENT_ID_L2_RAM_ECC:
case GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET:
goya_print_irq_info(hdev, event_type, false);
hl_device_reset(hdev, true, false);
if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
break;
case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
@ -5096,7 +5112,7 @@ int goya_send_heartbeat(struct hl_device *hdev)
return hl_fw_send_heartbeat(hdev);
}
int goya_armcp_info_get(struct hl_device *hdev)
int goya_cpucp_info_get(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
struct asic_fixed_properties *prop = &hdev->asic_prop;
@ -5106,11 +5122,11 @@ int goya_armcp_info_get(struct hl_device *hdev)
if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q))
return 0;
rc = hl_fw_armcp_info_get(hdev);
rc = hl_fw_cpucp_info_get(hdev);
if (rc)
return rc;
dram_size = le64_to_cpu(prop->armcp_info.dram_size);
dram_size = le64_to_cpu(prop->cpucp_info.dram_size);
if (dram_size) {
if ((!is_power_of_2(dram_size)) ||
(dram_size < DRAM_PHYS_DEFAULT_SIZE)) {
@ -5124,8 +5140,8 @@ int goya_armcp_info_get(struct hl_device *hdev)
prop->dram_end_address = prop->dram_base_address + dram_size;
}
if (!strlen(prop->armcp_info.card_name))
strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
if (!strlen(prop->cpucp_info.card_name))
strncpy(prop->cpucp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
return 0;
@ -5141,7 +5157,7 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
/* clock gating not supported in Goya */
}
static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
static bool goya_is_device_idle(struct hl_device *hdev, u64 *mask,
struct seq_file *s)
{
const char *fmt = "%-5d%-9s%#-14x%#-16x%#x\n";
@ -5166,7 +5182,8 @@ static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GOYA_ENGINE_ID_DMA_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GOYA_ENGINE_ID_DMA_0 + i);
if (s)
seq_printf(s, dma_fmt, i, is_eng_idle ? "Y" : "N",
qm_glbl_sts0, dma_core_sts0);
@ -5189,7 +5206,8 @@ static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GOYA_ENGINE_ID_TPC_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GOYA_ENGINE_ID_TPC_0 + i);
if (s)
seq_printf(s, fmt, i, is_eng_idle ? "Y" : "N",
qm_glbl_sts0, cmdq_glbl_sts0, tpc_cfg_sts);
@ -5209,7 +5227,7 @@ static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << GOYA_ENGINE_ID_MME_0;
*mask |= ((u64) !is_eng_idle) << GOYA_ENGINE_ID_MME_0;
if (s) {
seq_printf(s, fmt, 0, is_eng_idle ? "Y" : "N", qm_glbl_sts0,
cmdq_glbl_sts0, mme_arch_sts);
@ -5369,7 +5387,6 @@ static const struct hl_asic_funcs goya_funcs = {
.send_cpu_message = goya_send_cpu_message,
.get_hw_state = goya_get_hw_state,
.pci_bars_map = goya_pci_bars_map,
.set_dram_bar_base = goya_set_ddr_bar_base,
.init_iatu = goya_init_iatu,
.rreg = hl_rreg,
.wreg = hl_wreg,

View File

@ -207,7 +207,7 @@ void goya_set_max_power(struct hl_device *hdev, u64 value);
void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq);
void goya_add_device_attr(struct hl_device *hdev,
struct attribute_group *dev_attr_grp);
int goya_armcp_info_get(struct hl_device *hdev);
int goya_cpucp_info_get(struct hl_device *hdev);
int goya_debug_coresight(struct hl_device *hdev, void *data);
void goya_halt_coresight(struct hl_device *hdev);

View File

@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0
*
* Copyright 2016-2020 HabanaLabs, Ltd.
* Copyright 2020 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
#ifndef ARMCP_IF_H
#define ARMCP_IF_H
#ifndef CPUCP_IF_H
#define CPUCP_IF_H
#include <linux/types.h>
@ -50,16 +50,16 @@ enum pq_init_status {
};
/*
* ArmCP Primary Queue Packets
* CpuCP Primary Queue Packets
*
* During normal operation, the host's kernel driver needs to send various
* messages to ArmCP, usually either to SET some value into a H/W periphery or
* messages to CpuCP, usually either to SET some value into a H/W periphery or
* to GET the current value of some H/W periphery. For example, SET the
* frequency of MME/TPC and GET the value of the thermal sensor.
*
* These messages can be initiated either by the User application or by the
* host's driver itself, e.g. power management code. In either case, the
* communication from the host's driver to ArmCP will *always* be in
* communication from the host's driver to CpuCP will *always* be in
* synchronous mode, meaning that the host will send a single message and poll
* until the message was acknowledged and the results are ready (if results are
* needed).
@ -73,21 +73,20 @@ enum pq_init_status {
*
* The message, inputs/outputs (if relevant) and fence object will be located
* on the device DDR at an address that will be determined by the host's driver.
* During device initialization phase, the host will pass to ArmCP that address.
* During device initialization phase, the host will pass to CpuCP that address.
* Most of the message types will contain inputs/outputs inside the message
* itself. The common part of each message will contain the opcode of the
* message (its type) and a field representing a fence object.
*
* When the host's driver wishes to send a message to ArmCP, it will write the
* message contents to the device DDR, clear the fence object and then write the
* value 484 to the mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR register to issue
* the 484 interrupt-id to the ARM core.
* When the host's driver wishes to send a message to CPU CP, it will write the
* message contents to the device DDR, clear the fence object and then write to
* the PSOC_ARC1_AUX_SW_INTR, to issue interrupt 121 to ARC Management CPU.
*
* Upon receiving the 484 interrupt-id, ArmCP will read the message from the
* DDR. In case the message is a SET operation, ArmCP will first perform the
* Upon receiving the interrupt (#121), CpuCP will read the message from the
* DDR. In case the message is a SET operation, CpuCP will first perform the
* operation and then write to the fence object on the device DDR. In case the
* message is a GET operation, ArmCP will first fill the results section on the
* device DDR and then write to the fence object. If an error occurred, ArmCP
* message is a GET operation, CpuCP will first fill the results section on the
* device DDR and then write to the fence object. If an error occurred, CpuCP
* will fill the rc field with the right error code.
*
* In the meantime, the host's driver will poll on the fence object. Once the
@ -96,164 +95,174 @@ enum pq_init_status {
* driver.
*
* To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
* so the value being put by the host's driver matches the value read by ArmCP
* so the value being put by the host's driver matches the value read by CpuCP
*
* Non-QMAN packets should be limited to values 1 through (2^8 - 1)
*
* Detailed description:
*
* ARMCP_PACKET_DISABLE_PCI_ACCESS -
* CPUCP_PACKET_DISABLE_PCI_ACCESS -
* After receiving this packet the embedded CPU must NOT issue PCI
* transactions (read/write) towards the Host CPU. This also includes
* sending MSI-X interrupts.
* This packet is usually sent before the device is moved to D3Hot state.
*
* ARMCP_PACKET_ENABLE_PCI_ACCESS -
* CPUCP_PACKET_ENABLE_PCI_ACCESS -
* After receiving this packet the embedded CPU is allowed to issue PCI
* transactions towards the Host CPU, including sending MSI-X interrupts.
* This packet is usually sent after the device is moved to D0 state.
*
* ARMCP_PACKET_TEMPERATURE_GET -
* CPUCP_PACKET_TEMPERATURE_GET -
* Fetch the current temperature / Max / Max Hyst / Critical /
* Critical Hyst of a specified thermal sensor. The packet's
* arguments specify the desired sensor and the field to get.
*
* ARMCP_PACKET_VOLTAGE_GET -
* CPUCP_PACKET_VOLTAGE_GET -
* Fetch the voltage / Max / Min of a specified sensor. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_CURRENT_GET -
* CPUCP_PACKET_CURRENT_GET -
* Fetch the current / Max / Min of a specified sensor. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_FAN_SPEED_GET -
* CPUCP_PACKET_FAN_SPEED_GET -
* Fetch the speed / Max / Min of a specified fan. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_PWM_GET -
* CPUCP_PACKET_PWM_GET -
* Fetch the pwm value / mode of a specified pwm. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_PWM_SET -
* CPUCP_PACKET_PWM_SET -
* Set the pwm value / mode of a specified pwm. The packet's
* arguments specify the sensor, type and value.
*
* ARMCP_PACKET_FREQUENCY_SET -
* CPUCP_PACKET_FREQUENCY_SET -
* Set the frequency of a specified PLL. The packet's arguments specify
* the PLL and the desired frequency. The actual frequency in the device
* might differ from the requested frequency.
*
* ARMCP_PACKET_FREQUENCY_GET -
* CPUCP_PACKET_FREQUENCY_GET -
* Fetch the frequency of a specified PLL. The packet's arguments specify
* the PLL.
*
* ARMCP_PACKET_LED_SET -
* CPUCP_PACKET_LED_SET -
* Set the state of a specified led. The packet's arguments
* specify the led and the desired state.
*
* ARMCP_PACKET_I2C_WR -
* CPUCP_PACKET_I2C_WR -
* Write 32-bit value to I2C device. The packet's arguments specify the
* I2C bus, address and value.
*
* ARMCP_PACKET_I2C_RD -
* CPUCP_PACKET_I2C_RD -
* Read 32-bit value from I2C device. The packet's arguments specify the
* I2C bus and address.
*
* ARMCP_PACKET_INFO_GET -
* CPUCP_PACKET_INFO_GET -
* Fetch information from the device as specified in the packet's
* structure. The host's driver passes the max size it allows the ArmCP to
* structure. The host's driver passes the max size it allows the CpuCP to
* write to the structure, to prevent data corruption in case of
* mismatched driver/FW versions.
*
* ARMCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
* CPUCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
*
* ARMCP_PACKET_UNMASK_RAZWI_IRQ -
* CPUCP_PACKET_UNMASK_RAZWI_IRQ -
* Unmask the given IRQ. The IRQ number is specified in the value field.
* The packet is sent after receiving an interrupt and printing its
* relevant information.
*
* ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
* CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
* Unmask the given IRQs. The IRQ numbers are specified in an array right
* after the armcp_packet structure, where its first element is the array
* after the cpucp_packet structure, where its first element is the array
* length. The packet is sent after a soft reset was done in order to
* handle any interrupts that were sent during the reset process.
*
* ARMCP_PACKET_TEST -
* Test packet for ArmCP connectivity. The CPU will put the fence value
* CPUCP_PACKET_TEST -
* Test packet for CpuCP connectivity. The CPU will put the fence value
* in the result field.
*
* ARMCP_PACKET_FREQUENCY_CURR_GET -
* CPUCP_PACKET_FREQUENCY_CURR_GET -
* Fetch the current frequency of a specified PLL. The packet's arguments
* specify the PLL.
*
* ARMCP_PACKET_MAX_POWER_GET -
* CPUCP_PACKET_MAX_POWER_GET -
* Fetch the maximal power of the device.
*
* ARMCP_PACKET_MAX_POWER_SET -
* CPUCP_PACKET_MAX_POWER_SET -
* Set the maximal power of the device. The packet's arguments specify
* the power.
*
* ARMCP_PACKET_EEPROM_DATA_GET -
* Get EEPROM data from the ArmCP kernel. The buffer is specified in the
* CPUCP_PACKET_EEPROM_DATA_GET -
* Get EEPROM data from the CpuCP kernel. The buffer is specified in the
* addr field. The CPU will put the returned data size in the result
* field. In addition, the host's driver passes the max size it allows the
* ArmCP to write to the structure, to prevent data corruption in case of
* CpuCP to write to the structure, to prevent data corruption in case of
* mismatched driver/FW versions.
*
* ARMCP_PACKET_TEMPERATURE_SET -
* CPUCP_PACKET_TEMPERATURE_SET -
* Set the value of the offset property of a specified thermal sensor.
* The packet's arguments specify the desired sensor and the field to
* set.
*
* ARMCP_PACKET_VOLTAGE_SET -
* CPUCP_PACKET_VOLTAGE_SET -
* Trigger the reset_history property of a specified voltage sensor.
* The packet's arguments specify the desired sensor and the field to
* set.
*
* ARMCP_PACKET_CURRENT_SET -
* CPUCP_PACKET_CURRENT_SET -
* Trigger the reset_history property of a specified current sensor.
* The packet's arguments specify the desired sensor and the field to
* set.
*
* CPUCP_PACKET_PLL_REG_GET -
* Fetch a register of a PLL from the required PLL IP.
* The packet's arguments specify the PLL IP and the register to get.
* Each register is a 32-bit value which is returned in the result field.
*
*/
enum armcp_packet_id {
ARMCP_PACKET_DISABLE_PCI_ACCESS = 1, /* internal */
ARMCP_PACKET_ENABLE_PCI_ACCESS, /* internal */
ARMCP_PACKET_TEMPERATURE_GET, /* sysfs */
ARMCP_PACKET_VOLTAGE_GET, /* sysfs */
ARMCP_PACKET_CURRENT_GET, /* sysfs */
ARMCP_PACKET_FAN_SPEED_GET, /* sysfs */
ARMCP_PACKET_PWM_GET, /* sysfs */
ARMCP_PACKET_PWM_SET, /* sysfs */
ARMCP_PACKET_FREQUENCY_SET, /* sysfs */
ARMCP_PACKET_FREQUENCY_GET, /* sysfs */
ARMCP_PACKET_LED_SET, /* debugfs */
ARMCP_PACKET_I2C_WR, /* debugfs */
ARMCP_PACKET_I2C_RD, /* debugfs */
ARMCP_PACKET_INFO_GET, /* IOCTL */
ARMCP_PACKET_FLASH_PROGRAM_REMOVED,
ARMCP_PACKET_UNMASK_RAZWI_IRQ, /* internal */
ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY, /* internal */
ARMCP_PACKET_TEST, /* internal */
ARMCP_PACKET_FREQUENCY_CURR_GET, /* sysfs */
ARMCP_PACKET_MAX_POWER_GET, /* sysfs */
ARMCP_PACKET_MAX_POWER_SET, /* sysfs */
ARMCP_PACKET_EEPROM_DATA_GET, /* sysfs */
ARMCP_RESERVED,
ARMCP_PACKET_TEMPERATURE_SET, /* sysfs */
ARMCP_PACKET_VOLTAGE_SET, /* sysfs */
ARMCP_PACKET_CURRENT_SET, /* sysfs */
enum cpucp_packet_id {
CPUCP_PACKET_DISABLE_PCI_ACCESS = 1, /* internal */
CPUCP_PACKET_ENABLE_PCI_ACCESS, /* internal */
CPUCP_PACKET_TEMPERATURE_GET, /* sysfs */
CPUCP_PACKET_VOLTAGE_GET, /* sysfs */
CPUCP_PACKET_CURRENT_GET, /* sysfs */
CPUCP_PACKET_FAN_SPEED_GET, /* sysfs */
CPUCP_PACKET_PWM_GET, /* sysfs */
CPUCP_PACKET_PWM_SET, /* sysfs */
CPUCP_PACKET_FREQUENCY_SET, /* sysfs */
CPUCP_PACKET_FREQUENCY_GET, /* sysfs */
CPUCP_PACKET_LED_SET, /* debugfs */
CPUCP_PACKET_I2C_WR, /* debugfs */
CPUCP_PACKET_I2C_RD, /* debugfs */
CPUCP_PACKET_INFO_GET, /* IOCTL */
CPUCP_PACKET_FLASH_PROGRAM_REMOVED,
CPUCP_PACKET_UNMASK_RAZWI_IRQ, /* internal */
CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY, /* internal */
CPUCP_PACKET_TEST, /* internal */
CPUCP_PACKET_FREQUENCY_CURR_GET, /* sysfs */
CPUCP_PACKET_MAX_POWER_GET, /* sysfs */
CPUCP_PACKET_MAX_POWER_SET, /* sysfs */
CPUCP_PACKET_EEPROM_DATA_GET, /* sysfs */
CPUCP_RESERVED,
CPUCP_PACKET_TEMPERATURE_SET, /* sysfs */
CPUCP_PACKET_VOLTAGE_SET, /* sysfs */
CPUCP_PACKET_CURRENT_SET, /* sysfs */
CPUCP_PACKET_PCIE_THROUGHPUT_GET, /* internal */
CPUCP_PACKET_PCIE_REPLAY_CNT_GET, /* internal */
CPUCP_PACKET_TOTAL_ENERGY_GET, /* internal */
CPUCP_PACKET_PLL_REG_GET, /* internal */
};
#define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5
#define CPUCP_PACKET_FENCE_VAL 0xFE8CE7A5
#define ARMCP_PKT_CTL_RC_SHIFT 12
#define ARMCP_PKT_CTL_RC_MASK 0x0000F000
#define CPUCP_PKT_CTL_RC_SHIFT 12
#define CPUCP_PKT_CTL_RC_MASK 0x0000F000
#define ARMCP_PKT_CTL_OPCODE_SHIFT 16
#define ARMCP_PKT_CTL_OPCODE_MASK 0x1FFF0000
#define CPUCP_PKT_CTL_OPCODE_SHIFT 16
#define CPUCP_PKT_CTL_OPCODE_MASK 0x1FFF0000
struct armcp_packet {
struct cpucp_packet {
union {
__le64 value; /* For SET packets */
__le64 result; /* For GET packets */
@@ -277,71 +286,97 @@ struct armcp_packet
__u8 pad; /* unused */
};
struct {/* For PLL register fetch */
__le16 pll_type;
__le16 pll_reg;
};
/* For any general request */
__le32 index;
/* For frequency get/set */
__le32 pll_index;
/* For led set */
__le32 led_index;
/* For get Armcp info/EEPROM data */
/* For get CpuCP info/EEPROM data */
__le32 data_max_size;
};
__le32 reserved;
};
struct armcp_unmask_irq_arr_packet {
struct armcp_packet armcp_pkt;
struct cpucp_unmask_irq_arr_packet {
struct cpucp_packet cpucp_pkt;
__le32 length;
__le32 irqs[0];
};
enum armcp_packet_rc {
armcp_packet_success,
armcp_packet_invalid,
armcp_packet_fault
enum cpucp_packet_rc {
cpucp_packet_success,
cpucp_packet_invalid,
cpucp_packet_fault
};
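For orientation, here is a minimal sketch of how a host-side caller might use the control-word defines above: the opcode is shifted into the packet's ctl field before sending, and the return code that CpuCP writes back into the same word is extracted afterwards. The ctl field itself and the actual send path live in the parts of the header and driver not shown in this hunk, so treat this as an illustration of the bit layout, not driver code.

#include <linux/errno.h>
#include <linux/types.h>

/* Illustrative sketch only; CPUCP_* symbols come from this header
 * (cpucp_if.h). The real ctl field is __le32, endian handling omitted.
 */
static int cpucp_ctl_roundtrip_example(void)
{
	u32 ctl = CPUCP_PACKET_TEMPERATURE_GET << CPUCP_PKT_CTL_OPCODE_SHIFT;
	u32 rc;

	/* ... packet is queued to CpuCP and the firmware completes it ... */

	rc = (ctl & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
	return (rc == cpucp_packet_success) ? 0 : -EIO;
}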
/*
* armcp_temp_type should adhere to hwmon_temp_attributes
* cpucp_temp_type should adhere to hwmon_temp_attributes
* defined in Linux kernel hwmon.h file
*/
enum armcp_temp_type {
armcp_temp_input,
armcp_temp_max = 6,
armcp_temp_max_hyst,
armcp_temp_crit,
armcp_temp_crit_hyst,
armcp_temp_offset = 19,
armcp_temp_highest = 22,
armcp_temp_reset_history = 23
enum cpucp_temp_type {
cpucp_temp_input,
cpucp_temp_max = 6,
cpucp_temp_max_hyst,
cpucp_temp_crit,
cpucp_temp_crit_hyst,
cpucp_temp_offset = 19,
cpucp_temp_highest = 22,
cpucp_temp_reset_history = 23
};
enum armcp_in_attributes {
armcp_in_input,
armcp_in_min,
armcp_in_max,
armcp_in_highest = 7,
armcp_in_reset_history
enum cpucp_in_attributes {
cpucp_in_input,
cpucp_in_min,
cpucp_in_max,
cpucp_in_highest = 7,
cpucp_in_reset_history
};
enum armcp_curr_attributes {
armcp_curr_input,
armcp_curr_min,
armcp_curr_max,
armcp_curr_highest = 7,
armcp_curr_reset_history
enum cpucp_curr_attributes {
cpucp_curr_input,
cpucp_curr_min,
cpucp_curr_max,
cpucp_curr_highest = 7,
cpucp_curr_reset_history
};
enum armcp_fan_attributes {
armcp_fan_input,
armcp_fan_min = 2,
armcp_fan_max
enum cpucp_fan_attributes {
cpucp_fan_input,
cpucp_fan_min = 2,
cpucp_fan_max
};
enum armcp_pwm_attributes {
armcp_pwm_input,
armcp_pwm_enable
enum cpucp_pwm_attributes {
cpucp_pwm_input,
cpucp_pwm_enable
};
enum cpucp_pcie_throughput_attributes {
cpucp_pcie_throughput_tx,
cpucp_pcie_throughput_rx
};
enum cpucp_pll_reg_attributes {
cpucp_pll_nr_reg,
cpucp_pll_nf_reg,
cpucp_pll_od_reg,
cpucp_pll_div_factor_reg,
cpucp_pll_div_sel_reg
};
enum cpucp_pll_type_attributes {
cpucp_pll_cpu,
cpucp_pll_pci,
};
/* Event Queue Packets */
@@ -351,32 +386,32 @@ struct eq_generic_event {
};
/*
* ArmCP info
* CpuCP info
*/
#define CARD_NAME_MAX_LEN 16
#define VERSION_MAX_LEN 128
#define ARMCP_MAX_SENSORS 128
#define CPUCP_MAX_SENSORS 128
struct armcp_sensor {
struct cpucp_sensor {
__le32 type;
__le32 flags;
};
/**
* struct armcp_card_types - ASIC card type.
* @armcp_card_type_pci: PCI card.
* @armcp_card_type_pmc: PCI Mezzanine Card.
* struct cpucp_card_types - ASIC card type.
* @cpucp_card_type_pci: PCI card.
* @cpucp_card_type_pmc: PCI Mezzanine Card.
*/
enum armcp_card_types {
armcp_card_type_pci,
armcp_card_type_pmc
enum cpucp_card_types {
cpucp_card_type_pci,
cpucp_card_type_pmc
};
/**
* struct armcp_info - Info from ArmCP that is necessary to the host's driver
* struct cpucp_info - Info from CpuCP that is necessary to the host's driver
* @sensors: available sensors description.
* @kernel_version: ArmCP linux kernel version.
* @kernel_version: CpuCP linux kernel version.
* @reserved: reserved field.
* @card_type: card configuration type.
* @card_location: in a server, each card has different connections topology
@@ -385,12 +420,12 @@ enum armcp_card_types {
* @infineon_version: Infineon main DC-DC version.
* @fuse_version: silicon production FUSE information.
* @thermal_version: thermald S/W version.
* @armcp_version: ArmCP S/W version.
* @cpucp_version: CpuCP S/W version.
* @dram_size: available DRAM size.
* @card_name: card name that will be displayed in HWMON subsystem on the host
*/
struct armcp_info {
struct armcp_sensor sensors[ARMCP_MAX_SENSORS];
struct cpucp_info {
struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
__u8 kernel_version[VERSION_MAX_LEN];
__le32 reserved;
__le32 card_type;
@@ -399,9 +434,10 @@ struct armcp_info {
__le32 infineon_version;
__u8 fuse_version[VERSION_MAX_LEN];
__u8 thermal_version[VERSION_MAX_LEN];
__u8 armcp_version[VERSION_MAX_LEN];
__u8 cpucp_version[VERSION_MAX_LEN];
__le32 reserved2;
__le64 dram_size;
char card_name[CARD_NAME_MAX_LEN];
};
#endif /* ARMCP_IF_H */
#endif /* CPUCP_IF_H */

@@ -40,7 +40,7 @@ struct hl_bd {
*/
#define BD_CTL_COMP_OFFSET_SHIFT 16
#define BD_CTL_COMP_OFFSET_MASK 0x00FF0000
#define BD_CTL_COMP_OFFSET_MASK 0x0FFF0000
#define BD_CTL_COMP_DATA_SHIFT 0
#define BD_CTL_COMP_DATA_MASK 0x0000FFFF

@@ -44,6 +44,8 @@
#define MME_NUMBER_OF_MASTER_ENGINES 2
#define MME_NUMBER_OF_SLAVE_ENGINES 2
#define TPC_NUMBER_OF_ENGINES 8
#define DMA_NUMBER_OF_CHANNELS 8

@@ -12,191 +12,160 @@
/* Useful masks for bits in various registers */
#define PCI_DMA_QMAN_ENABLE (\
(0xF << DMA0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CQF_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CP_EN_MASK, 0xF)))
#define QMAN_EXTERNAL_MAKE_TRUSTED (\
(0xF << DMA0_QM_GLBL_PROT_PQF_SHIFT) | \
(0xF << DMA0_QM_GLBL_PROT_CQF_SHIFT) | \
(0xF << DMA0_QM_GLBL_PROT_CP_SHIFT) | \
(0x1 << DMA0_QM_GLBL_PROT_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_PROT_PQF_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_CQF_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_CP_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_ERR_MASK, 0x1)))
#define QMAN_INTERNAL_MAKE_TRUSTED (\
(0xF << DMA0_QM_GLBL_PROT_PQF_SHIFT) | \
(0x1 << DMA0_QM_GLBL_PROT_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_PROT_PQF_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_ERR_MASK, 0x1)))
#define HBM_DMA_QMAN_ENABLE (\
(0xF << DMA0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CQF_EN_MASK, 0x1F)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CP_EN_MASK, 0x1F)))
#define QMAN_MME_ENABLE (\
(0xF << MME0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(MME0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(MME0_QM_GLBL_CFG0_CQF_EN_MASK, 0x1F)) | \
(FIELD_PREP(MME0_QM_GLBL_CFG0_CP_EN_MASK, 0x1F)))
#define QMAN_TPC_ENABLE (\
(0xF << TPC0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(TPC0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(TPC0_QM_GLBL_CFG0_CQF_EN_MASK, 0x1F)) | \
(FIELD_PREP(TPC0_QM_GLBL_CFG0_CP_EN_MASK, 0x1F)))
#define QMAN_UPPER_CP_CGM_PWR_GATE_EN (\
(0x20 << DMA0_QM_CGM_CFG_IDLE_TH_SHIFT) | \
(0xA << DMA0_QM_CGM_CFG_G2F_TH_SHIFT) | \
(0x10 << DMA0_QM_CGM_CFG_CP_IDLE_MASK_SHIFT) | \
(1 << DMA0_QM_CGM_CFG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_CGM_CFG_IDLE_TH_MASK, 0x20)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_G2F_TH_MASK, 0xA)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_CP_IDLE_MASK_MASK, 0x10)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_EN_MASK, 0x1)))
#define QMAN_COMMON_CP_CGM_PWR_GATE_EN (\
(0x20 << DMA0_QM_CGM_CFG_IDLE_TH_SHIFT) | \
(0xA << DMA0_QM_CGM_CFG_G2F_TH_SHIFT) | \
(0xF << DMA0_QM_CGM_CFG_CP_IDLE_MASK_SHIFT) | \
(1 << DMA0_QM_CGM_CFG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_CGM_CFG_IDLE_TH_MASK, 0x20)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_G2F_TH_MASK, 0xA)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_CP_IDLE_MASK_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_EN_MASK, 0x1)))
#define PCI_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0xF)))
#define PCI_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)))
#define HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0x1F)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0x1F)))
#define HBM_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
#define TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << TPC0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0x1F)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0x1F)))
#define TPC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << TPC0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
#define MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << MME0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0x1F)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0x1F)))
#define MME_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << MME0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
#define QMAN_CGM1_PWR_GATE_EN (0xA << DMA0_QM_CGM_CFG1_MASK_TH_SHIFT)
#define QMAN_CGM1_PWR_GATE_EN (FIELD_PREP(DMA0_QM_CGM_CFG1_MASK_TH_MASK, 0xA))
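The conversion above leans on FIELD_PREP() from <linux/bitfield.h>, which derives the shift from the mask and places the value into that field, so each rewritten macro expands to the same constant as the old hand-shifted form. A small equivalence sketch with a made-up field name (the real DMA0_QM_* masks behave the same way):

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

/* Hypothetical 4-bit field occupying bits 11:8 of a register. */
#define EXAMPLE_FIELD_SHIFT	8
#define EXAMPLE_FIELD_MASK	GENMASK(11, 8)

static bool field_prep_matches_manual_shift(void)
{
	u32 old_style = 0xF << EXAMPLE_FIELD_SHIFT;		/* 0x00000F00 */
	u32 new_style = FIELD_PREP(EXAMPLE_FIELD_MASK, 0xF);	/* 0x00000F00 */

	return old_style == new_style;	/* always true */
}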
/* RESET registers configuration */
#define CFG_RST_L_PSOC_SHIFT 0
#define CFG_RST_L_PCIE_SHIFT 1
#define CFG_RST_L_PCIE_IF_SHIFT 2
#define CFG_RST_L_HBM_S_PLL_SHIFT 3
#define CFG_RST_L_TPC_S_PLL_SHIFT 4
#define CFG_RST_L_MME_S_PLL_SHIFT 5
#define CFG_RST_L_CPU_PLL_SHIFT 6
#define CFG_RST_L_PCIE_PLL_SHIFT 7
#define CFG_RST_L_NIC_S_PLL_SHIFT 8
#define CFG_RST_L_HBM_N_PLL_SHIFT 9
#define CFG_RST_L_TPC_N_PLL_SHIFT 10
#define CFG_RST_L_MME_N_PLL_SHIFT 11
#define CFG_RST_L_NIC_N_PLL_SHIFT 12
#define CFG_RST_L_DMA_W_PLL_SHIFT 13
#define CFG_RST_L_SIF_W_PLL_SHIFT 14
#define CFG_RST_L_MESH_W_PLL_SHIFT 15
#define CFG_RST_L_SRAM_W_PLL_SHIFT 16
#define CFG_RST_L_DMA_E_PLL_SHIFT 17
#define CFG_RST_L_SIF_E_PLL_SHIFT 18
#define CFG_RST_L_MESH_E_PLL_SHIFT 19
#define CFG_RST_L_SRAM_E_PLL_SHIFT 20
#define CFG_RST_L_IF_1_SHIFT 21
#define CFG_RST_L_IF_0_SHIFT 22
#define CFG_RST_L_IF_2_SHIFT 23
#define CFG_RST_L_IF_3_SHIFT 24
#define CFG_RST_L_TPC_0_SHIFT 25
#define CFG_RST_L_TPC_1_SHIFT 26
#define CFG_RST_L_TPC_2_SHIFT 27
#define CFG_RST_L_TPC_3_SHIFT 28
#define CFG_RST_L_TPC_4_SHIFT 29
#define CFG_RST_L_TPC_5_SHIFT 30
#define CFG_RST_L_TPC_6_SHIFT 31
#define CFG_RST_H_TPC_7_SHIFT 0
#define CFG_RST_H_MME_0_SHIFT 1
#define CFG_RST_H_MME_1_SHIFT 2
#define CFG_RST_H_MME_2_SHIFT 3
#define CFG_RST_H_MME_3_SHIFT 4
#define CFG_RST_H_HBM_0_SHIFT 5
#define CFG_RST_H_HBM_1_SHIFT 6
#define CFG_RST_H_HBM_2_SHIFT 7
#define CFG_RST_H_HBM_3_SHIFT 8
#define CFG_RST_H_NIC_0_SHIFT 9
#define CFG_RST_H_NIC_1_SHIFT 10
#define CFG_RST_H_NIC_2_SHIFT 11
#define CFG_RST_H_NIC_3_SHIFT 12
#define CFG_RST_H_NIC_4_SHIFT 13
#define CFG_RST_H_SM_0_SHIFT 14
#define CFG_RST_H_SM_1_SHIFT 15
#define CFG_RST_H_SM_2_SHIFT 16
#define CFG_RST_H_SM_3_SHIFT 17
#define CFG_RST_H_DMA_0_SHIFT 18
#define CFG_RST_H_DMA_1_SHIFT 19
#define CFG_RST_H_CPU_SHIFT 20
#define CFG_RST_H_MMU_SHIFT 21
#define CFG_RST_L_PSOC_MASK BIT_MASK(0)
#define CFG_RST_L_PCIE_MASK BIT_MASK(1)
#define CFG_RST_L_PCIE_IF_MASK BIT_MASK(2)
#define CFG_RST_L_HBM_S_PLL_MASK BIT_MASK(3)
#define CFG_RST_L_TPC_S_PLL_MASK BIT_MASK(4)
#define CFG_RST_L_MME_S_PLL_MASK BIT_MASK(5)
#define CFG_RST_L_CPU_PLL_MASK BIT_MASK(6)
#define CFG_RST_L_PCIE_PLL_MASK BIT_MASK(7)
#define CFG_RST_L_NIC_S_PLL_MASK BIT_MASK(8)
#define CFG_RST_L_HBM_N_PLL_MASK BIT_MASK(9)
#define CFG_RST_L_TPC_N_PLL_MASK BIT_MASK(10)
#define CFG_RST_L_MME_N_PLL_MASK BIT_MASK(11)
#define CFG_RST_L_NIC_N_PLL_MASK BIT_MASK(12)
#define CFG_RST_L_DMA_W_PLL_MASK BIT_MASK(13)
#define CFG_RST_L_SIF_W_PLL_MASK BIT_MASK(14)
#define CFG_RST_L_MESH_W_PLL_MASK BIT_MASK(15)
#define CFG_RST_L_SRAM_W_PLL_MASK BIT_MASK(16)
#define CFG_RST_L_DMA_E_PLL_MASK BIT_MASK(17)
#define CFG_RST_L_SIF_E_PLL_MASK BIT_MASK(18)
#define CFG_RST_L_MESH_E_PLL_MASK BIT_MASK(19)
#define CFG_RST_L_SRAM_E_PLL_MASK BIT_MASK(20)
#define CFG_RST_L_IF_1_MASK BIT_MASK(21)
#define CFG_RST_L_IF_0_MASK BIT_MASK(22)
#define CFG_RST_L_IF_2_MASK BIT_MASK(23)
#define CFG_RST_L_IF_3_MASK BIT_MASK(24)
#define CFG_RST_L_IF_MASK GENMASK(24, 21)
#define CFG_RST_H_DMA_MASK ((1 << CFG_RST_H_DMA_0_SHIFT) | \
(1 << CFG_RST_H_DMA_1_SHIFT))
#define CFG_RST_L_TPC_0_MASK BIT_MASK(25)
#define CFG_RST_L_TPC_1_MASK BIT_MASK(26)
#define CFG_RST_L_TPC_2_MASK BIT_MASK(27)
#define CFG_RST_L_TPC_3_MASK BIT_MASK(28)
#define CFG_RST_L_TPC_4_MASK BIT_MASK(29)
#define CFG_RST_L_TPC_5_MASK BIT_MASK(30)
#define CFG_RST_L_TPC_6_MASK BIT_MASK(31)
#define CFG_RST_L_TPC_MASK GENMASK(31, 25)
#define CFG_RST_H_CPU_MASK (1 << CFG_RST_H_CPU_SHIFT)
#define CFG_RST_H_MMU_MASK (1 << CFG_RST_H_MMU_SHIFT)
#define CFG_RST_H_TPC_7_MASK BIT_MASK(0)
#define CFG_RST_H_HBM_MASK ((1 << CFG_RST_H_HBM_0_SHIFT) | \
(1 << CFG_RST_H_HBM_1_SHIFT) | \
(1 << CFG_RST_H_HBM_2_SHIFT) | \
(1 << CFG_RST_H_HBM_3_SHIFT))
#define CFG_RST_H_MME_0_MASK BIT_MASK(1)
#define CFG_RST_H_MME_1_MASK BIT_MASK(2)
#define CFG_RST_H_MME_2_MASK BIT_MASK(3)
#define CFG_RST_H_MME_3_MASK BIT_MASK(4)
#define CFG_RST_H_MME_MASK GENMASK(4, 1)
#define CFG_RST_H_NIC_MASK ((1 << CFG_RST_H_NIC_0_SHIFT) | \
(1 << CFG_RST_H_NIC_1_SHIFT) | \
(1 << CFG_RST_H_NIC_2_SHIFT) | \
(1 << CFG_RST_H_NIC_3_SHIFT) | \
(1 << CFG_RST_H_NIC_4_SHIFT))
#define CFG_RST_H_HBM_0_MASK BIT_MASK(5)
#define CFG_RST_H_HBM_1_MASK BIT_MASK(6)
#define CFG_RST_H_HBM_2_MASK BIT_MASK(7)
#define CFG_RST_H_HBM_3_MASK BIT_MASK(8)
#define CFG_RST_H_HBM_MASK GENMASK(8, 5)
#define CFG_RST_H_SM_MASK ((1 << CFG_RST_H_SM_0_SHIFT) | \
(1 << CFG_RST_H_SM_1_SHIFT) | \
(1 << CFG_RST_H_SM_2_SHIFT) | \
(1 << CFG_RST_H_SM_3_SHIFT))
#define CFG_RST_H_NIC_0_MASK BIT_MASK(9)
#define CFG_RST_H_NIC_1_MASK BIT_MASK(10)
#define CFG_RST_H_NIC_2_MASK BIT_MASK(11)
#define CFG_RST_H_NIC_3_MASK BIT_MASK(12)
#define CFG_RST_H_NIC_4_MASK BIT_MASK(13)
#define CFG_RST_H_NIC_MASK GENMASK(13, 9)
#define CFG_RST_H_MME_MASK ((1 << CFG_RST_H_MME_0_SHIFT) | \
(1 << CFG_RST_H_MME_1_SHIFT) | \
(1 << CFG_RST_H_MME_2_SHIFT) | \
(1 << CFG_RST_H_MME_3_SHIFT))
#define CFG_RST_H_SM_0_MASK BIT_MASK(14)
#define CFG_RST_H_SM_1_MASK BIT_MASK(15)
#define CFG_RST_H_SM_2_MASK BIT_MASK(16)
#define CFG_RST_H_SM_3_MASK BIT_MASK(17)
#define CFG_RST_H_SM_MASK GENMASK(17, 14)
#define CFG_RST_L_PSOC_MASK (1 << CFG_RST_L_PSOC_SHIFT)
#define CFG_RST_H_DMA_0_MASK BIT_MASK(18)
#define CFG_RST_H_DMA_1_MASK BIT_MASK(19)
#define CFG_RST_H_DMA_MASK GENMASK(19, 18)
#define CFG_RST_L_IF_MASK ((1 << CFG_RST_L_IF_0_SHIFT) | \
(1 << CFG_RST_L_IF_1_SHIFT) | \
(1 << CFG_RST_L_IF_2_SHIFT) | \
(1 << CFG_RST_L_IF_3_SHIFT))
#define CFG_RST_L_TPC_MASK ((1 << CFG_RST_L_TPC_0_SHIFT) | \
(1 << CFG_RST_L_TPC_1_SHIFT) | \
(1 << CFG_RST_L_TPC_2_SHIFT) | \
(1 << CFG_RST_L_TPC_3_SHIFT) | \
(1 << CFG_RST_L_TPC_4_SHIFT) | \
(1 << CFG_RST_L_TPC_5_SHIFT) | \
(1 << CFG_RST_L_TPC_6_SHIFT))
#define CFG_RST_H_TPC_MASK (1 << CFG_RST_H_TPC_7_SHIFT)
#define CA53_RESET (1 << CFG_RST_H_CPU_SHIFT)
#define CFG_RST_H_CPU_MASK BIT_MASK(20)
#define CFG_RST_H_MMU_MASK BIT_MASK(21)
#define UNIT_RST_L_PSOC_SHIFT 0
#define UNIT_RST_L_PCIE_SHIFT 1
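For readers less familiar with the helpers used in the reset-mask rewrite above: BIT_MASK(n) and GENMASK(h, l) both come from <linux/bits.h>, setting a single bit and a contiguous bit range respectively, so a grouped define such as CFG_RST_L_IF_MASK is exactly the OR of its per-unit masks. A small sanity sketch, not driver code:

#include <linux/bits.h>
#include <linux/types.h>

/* GENMASK(24, 21) covers bits 21..24, i.e. the same four IF units that the
 * individual BIT_MASK() defines above describe one by one.
 */
static bool cfg_rst_if_mask_matches(void)
{
	unsigned long grouped = GENMASK(24, 21);		/* 0x01E00000 */
	unsigned long per_unit = BIT_MASK(21) | BIT_MASK(22) |
				 BIT_MASK(23) | BIT_MASK(24);

	return grouped == per_unit;	/* always true */
}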

@@ -12,6 +12,7 @@
* PSOC scratch-pad registers
*/
#define mmHW_STATE mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
#define mmFUSE_VER_OFFSET mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
#define mmCPU_CMD_STATUS_TO_HOST mmPSOC_GLOBAL_CONF_SCRATCHPAD_23
#define mmCPU_BOOT_ERR0 mmPSOC_GLOBAL_CONF_SCRATCHPAD_24
#define mmCPU_BOOT_ERR1 mmPSOC_GLOBAL_CONF_SCRATCHPAD_25

@@ -22,6 +22,7 @@
#define mmCPU_CQ_BASE_ADDR_LOW mmPSOC_GLOBAL_CONF_SCRATCHPAD_8
#define mmCPU_CQ_BASE_ADDR_HIGH mmPSOC_GLOBAL_CONF_SCRATCHPAD_9
#define mmCPU_CQ_LENGTH mmPSOC_GLOBAL_CONF_SCRATCHPAD_10
#define mmFUSE_VER_OFFSET mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
#define mmCPU_CMD_STATUS_TO_HOST mmPSOC_GLOBAL_CONF_SCRATCHPAD_23
#define mmCPU_BOOT_ERR0 mmPSOC_GLOBAL_CONF_SCRATCHPAD_24
#define mmCPU_BOOT_ERR1 mmPSOC_GLOBAL_CONF_SCRATCHPAD_25

@@ -29,6 +29,8 @@
#define HOP3_SHIFT 21
#define HOP4_SHIFT 12
#define MMU_ARCH_5_HOPS 5
#define HOP_PHYS_ADDR_MASK (~FLAGS_MASK)
#define HL_PTE_SIZE sizeof(u64)

@@ -264,6 +264,10 @@ enum hl_device_status {
* HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time
* for synchronization.
* HL_INFO_CS_COUNTERS - Retrieve command submission counters
* HL_INFO_PCI_COUNTERS - Retrieve PCI counters
* HL_INFO_CLK_THROTTLE_REASON - Retrieve clock throttling reason
* HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore
* HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -276,6 +280,10 @@ enum hl_device_status {
#define HL_INFO_RESET_COUNT 9
#define HL_INFO_TIME_SYNC 10
#define HL_INFO_CS_COUNTERS 11
#define HL_INFO_PCI_COUNTERS 12
#define HL_INFO_CLK_THROTTLE_REASON 13
#define HL_INFO_SYNC_MANAGER 14
#define HL_INFO_TOTAL_ENERGY 15
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -289,7 +297,7 @@ struct hl_info_hw_ip_info {
__u32 device_id; /* PCI Device ID */
__u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */
__u32 reserved[2];
__u32 armcp_cpld_version;
__u32 cpld_version;
__u32 psoc_pci_pll_nr;
__u32 psoc_pci_pll_nf;
__u32 psoc_pci_pll_od;
@@ -297,7 +305,7 @@ struct hl_info_hw_ip_info {
__u8 tpc_enabled_mask;
__u8 dram_enabled;
__u8 pad[2];
__u8 armcp_version[HL_INFO_VERSION_MAX_LEN];
__u8 cpucp_version[HL_INFO_VERSION_MAX_LEN];
__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
};
@@ -313,6 +321,12 @@ struct hl_info_hw_idle {
* Bits definition is according to `enum <chip>_enging_id'.
*/
__u32 busy_engines_mask;
/*
* Extended Bitmask of busy engines.
* Bits definition is according to `enum <chip>_enging_id'.
*/
__u64 busy_engines_mask_ext;
};
struct hl_info_device_status {
@@ -340,18 +354,61 @@ struct hl_info_time_sync {
__u64 host_time;
};
/**
* struct hl_info_pci_counters - pci counters
* @rx_throughput: PCI rx throughput KBps
* @tx_throughput: PCI tx throughput KBps
* @replay_cnt: PCI replay counter
*/
struct hl_info_pci_counters {
__u64 rx_throughput;
__u64 tx_throughput;
__u64 replay_cnt;
};
#define HL_CLK_THROTTLE_POWER 0x1
#define HL_CLK_THROTTLE_THERMAL 0x2
/**
* struct hl_info_clk_throttle - clock throttling reason
* @clk_throttling_reason: each bit represents a clk throttling reason
*/
struct hl_info_clk_throttle {
__u32 clk_throttling_reason;
};
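A brief user-space sketch of decoding the bitmask returned by the HL_INFO_CLK_THROTTLE_REASON query; the structure is assumed to have already been filled by a prior INFO IOCTL call, and the installed header path is an assumption.

#include <stdio.h>
#include <misc/habanalabs.h>	/* installed copy of this uapi header */

/* Sketch: 'info' is assumed to have been filled by an INFO IOCTL call with
 * op == HL_INFO_CLK_THROTTLE_REASON.
 */
static void print_throttle_reason(const struct hl_info_clk_throttle *info)
{
	if (info->clk_throttling_reason & HL_CLK_THROTTLE_POWER)
		printf("clocks throttled: power envelope\n");
	if (info->clk_throttling_reason & HL_CLK_THROTTLE_THERMAL)
		printf("clocks throttled: thermal limit\n");
}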
/**
* struct hl_info_energy - device energy information
* @total_energy_consumption: total device energy consumption
*/
struct hl_info_energy {
__u64 total_energy_consumption;
};
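As a usage sketch for the new HL_INFO_TOTAL_ENERGY query: user space fills struct hl_info_args with the op code and a pointer to the output structure, then issues the INFO IOCTL. The device node path below is an assumption (nodes are enumerated as /dev/hl<n>) and error handling is trimmed to the essentials.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <misc/habanalabs.h>	/* installed copy of this uapi header */

int main(void)
{
	struct hl_info_energy energy = {0};
	struct hl_info_args args = {0};
	int fd = open("/dev/hl0", O_RDWR);	/* assumed device node */

	if (fd < 0)
		return 1;

	args.op = HL_INFO_TOTAL_ENERGY;
	args.return_pointer = (__u64) (uintptr_t) &energy;
	args.return_size = sizeof(energy);

	if (ioctl(fd, HL_IOCTL_INFO, &args) == 0)
		printf("total energy consumption: %llu\n",
		       (unsigned long long) energy.total_energy_consumption);

	close(fd);
	return 0;
}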
/**
* struct hl_info_sync_manager - sync manager information
* @first_available_sync_object: first available sob
* @first_available_monitor: first available monitor
*/
struct hl_info_sync_manager {
__u32 first_available_sync_object;
__u32 first_available_monitor;
};
/**
* struct hl_info_cs_counters - command submission counters
* @out_of_mem_drop_cnt: dropped due to memory allocation issue
* @parsing_drop_cnt: dropped due to error in packet parsing
* @queue_full_drop_cnt: dropped due to queue full
* @device_in_reset_drop_cnt: dropped due to device in reset
* @max_cs_in_flight_drop_cnt: dropped due to maximum CS in-flight
*/
struct hl_cs_counters {
__u64 out_of_mem_drop_cnt;
__u64 parsing_drop_cnt;
__u64 queue_full_drop_cnt;
__u64 device_in_reset_drop_cnt;
__u64 max_cs_in_flight_drop_cnt;
};
struct hl_info_cs_counters {
@@ -359,6 +416,13 @@ struct hl_info_cs_counters {
struct hl_cs_counters ctx_cs_counters;
};
enum gaudi_dcores {
HL_GAUDI_WS_DCORE,
HL_GAUDI_WN_DCORE,
HL_GAUDI_EN_DCORE,
HL_GAUDI_ES_DCORE
};
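This enum feeds the new HL_INFO_SYNC_MANAGER op: the caller places the dcore id in struct hl_info_args (see the union a few lines further down) and gets back the first free sync object and monitor of that dcore. A hedged sketch, with 'fd' assumed to be an open /dev/hl<n> descriptor and the installed header path an assumption:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>	/* installed copy of this uapi header */

/* Sketch: query sync manager allocation info for Gaudi's west-south dcore. */
static int query_ws_dcore_sync_manager(int fd)
{
	struct hl_info_sync_manager sm = {0};
	struct hl_info_args args = {0};

	args.op = HL_INFO_SYNC_MANAGER;
	args.dcore_id = HL_GAUDI_WS_DCORE;
	args.return_pointer = (__u64) (uintptr_t) &sm;
	args.return_size = sizeof(sm);

	if (ioctl(fd, HL_IOCTL_INFO, &args))
		return -1;

	printf("first free SOB %u, first free monitor %u\n",
	       sm.first_available_sync_object, sm.first_available_monitor);
	return 0;
}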
struct hl_info_args {
/* Location of relevant struct in userspace */
__u64 return_pointer;
@@ -375,6 +439,10 @@ struct hl_info_args {
__u32 op;
union {
/* Dcore id for which the information is relevant.
* For Gaudi refer to 'enum gaudi_dcores'
*/
__u32 dcore_id;
/* Context ID - Currently not in use */
__u32 ctx_id;
/* Period value for utilization rate (100ms - 1000ms, in 100ms
@@ -394,6 +462,9 @@ struct hl_info_args {
/* 2MB minus 32 bytes for 2xMSG_PROT */
#define HL_MAX_CB_SIZE (0x200000 - 32)
/* Indicates whether the command buffer should be mapped to the device's MMU */
#define HL_CB_FLAGS_MAP 0x1
struct hl_cb_in {
/* Handle of CB or 0 if we want to create one */
__u64 cb_handle;
@@ -405,7 +476,8 @@ struct hl_cb_in {
__u32 cb_size;
/* Context ID - Currently not in use */
__u32 ctx_id;
__u32 pad;
/* HL_CB_FLAGS_* */
__u32 flags;
};
struct hl_cb_out {
@@ -788,6 +860,12 @@ struct hl_debug_args {
* When creating a new CB, the IOCTL returns a handle of it, and the user-space
* process needs to use that handle to mmap the buffer so it can access them.
*
* In some instances, the device must access the command buffer through the
* device's MMU, and thus its memory should be mapped. In these cases, the user
* can indicate to the driver that such a mapping is required.
* The resulting device virtual address will be used internally by the driver,
* and won't be returned to user.
*
*/
#define HL_IOCTL_CB \
_IOWR('H', 0x02, union hl_cb_args)
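To show the new mapping flag in practice, here is a hedged user-space sketch that creates a command buffer and asks for it to be mapped to the device MMU. 'fd' is assumed to be an open /dev/hl<n> descriptor, HL_CB_OP_CREATE comes from the part of the header not shown in this excerpt, and the installed header path is an assumption.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>	/* installed copy of this uapi header */

/* Sketch: create a 4 KB command buffer that is also mapped to the device's
 * MMU via HL_CB_FLAGS_MAP.
 */
static int create_mapped_cb(int fd, uint64_t *handle)
{
	union hl_cb_args args;

	memset(&args, 0, sizeof(args));
	args.in.op = HL_CB_OP_CREATE;
	args.in.cb_size = 4096;
	args.in.flags = HL_CB_FLAGS_MAP;

	if (ioctl(fd, HL_IOCTL_CB, &args))
		return -1;

	*handle = args.out.cb_handle;	/* mmap() this handle to access the CB */
	return 0;
}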
@@ -846,6 +924,9 @@ struct hl_debug_args {
* inside the kernel until the CS has finished or until the user-requested
* timeout has expired.
*
* If the timeout value is 0, the driver won't sleep at all. It will check
* the status of the CS and return immediately.
*
* The return value of the IOCTL is a standard Linux error code. The possible
* values are:
*