From 7fc40bcaa63127d274e926dc1e9d62a72a01b1b5 Mon Sep 17 00:00:00 2001 From: Pawel Piskorski Date: Fri, 6 Dec 2019 17:32:38 +0200 Subject: [PATCH] habanalabs: flush only at the end of the map/unmap Optimize hl_mmu_map and hl_mmu_unmap by not calling flush(ctx) within per-page loop. Signed-off-by: Pawel Piskorski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 20 +++++++------ drivers/misc/habanalabs/habanalabs.h | 6 ++-- drivers/misc/habanalabs/memory.c | 9 ++++-- drivers/misc/habanalabs/mmu.c | 42 ++++++++++++++++++---------- 4 files changed, 50 insertions(+), 27 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index b8a8de24aaf7..3c6794883db1 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4776,7 +4776,8 @@ static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev) for (off = 0 ; off < CPU_FW_IMAGE_SIZE ; off += PAGE_SIZE_2MB) { rc = hl_mmu_map(hdev->kernel_ctx, prop->dram_base_address + off, - prop->dram_base_address + off, PAGE_SIZE_2MB); + prop->dram_base_address + off, PAGE_SIZE_2MB, + (off + PAGE_SIZE_2MB) == CPU_FW_IMAGE_SIZE); if (rc) { dev_err(hdev->dev, "Map failed for address 0x%llx\n", prop->dram_base_address + off); @@ -4786,7 +4787,7 @@ static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev) if (!(hdev->cpu_accessible_dma_address & (PAGE_SIZE_2MB - 1))) { rc = hl_mmu_map(hdev->kernel_ctx, VA_CPU_ACCESSIBLE_MEM_ADDR, - hdev->cpu_accessible_dma_address, PAGE_SIZE_2MB); + hdev->cpu_accessible_dma_address, PAGE_SIZE_2MB, true); if (rc) { dev_err(hdev->dev, @@ -4799,7 +4800,7 @@ static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev) rc = hl_mmu_map(hdev->kernel_ctx, VA_CPU_ACCESSIBLE_MEM_ADDR + cpu_off, hdev->cpu_accessible_dma_address + cpu_off, - PAGE_SIZE_4KB); + PAGE_SIZE_4KB, true); if (rc) { dev_err(hdev->dev, "Map failed for CPU accessible memory\n"); @@ -4825,14 +4826,15 @@ static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev) for (; cpu_off >= 0 ; cpu_off -= PAGE_SIZE_4KB) if (hl_mmu_unmap(hdev->kernel_ctx, VA_CPU_ACCESSIBLE_MEM_ADDR + cpu_off, - PAGE_SIZE_4KB)) + PAGE_SIZE_4KB, true)) dev_warn_ratelimited(hdev->dev, "failed to unmap address 0x%llx\n", VA_CPU_ACCESSIBLE_MEM_ADDR + cpu_off); unmap: for (; off >= 0 ; off -= PAGE_SIZE_2MB) if (hl_mmu_unmap(hdev->kernel_ctx, - prop->dram_base_address + off, PAGE_SIZE_2MB)) + prop->dram_base_address + off, PAGE_SIZE_2MB, + true)) dev_warn_ratelimited(hdev->dev, "failed to unmap address 0x%llx\n", prop->dram_base_address + off); @@ -4857,14 +4859,15 @@ void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev) if (!(hdev->cpu_accessible_dma_address & (PAGE_SIZE_2MB - 1))) { if (hl_mmu_unmap(hdev->kernel_ctx, VA_CPU_ACCESSIBLE_MEM_ADDR, - PAGE_SIZE_2MB)) + PAGE_SIZE_2MB, true)) dev_warn(hdev->dev, "Failed to unmap CPU accessible memory\n"); } else { for (cpu_off = 0 ; cpu_off < SZ_2M ; cpu_off += PAGE_SIZE_4KB) if (hl_mmu_unmap(hdev->kernel_ctx, VA_CPU_ACCESSIBLE_MEM_ADDR + cpu_off, - PAGE_SIZE_4KB)) + PAGE_SIZE_4KB, + (cpu_off + PAGE_SIZE_4KB) >= SZ_2M)) dev_warn_ratelimited(hdev->dev, "failed to unmap address 0x%llx\n", VA_CPU_ACCESSIBLE_MEM_ADDR + cpu_off); @@ -4872,7 +4875,8 @@ void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev) for (off = 0 ; off < CPU_FW_IMAGE_SIZE ; off += PAGE_SIZE_2MB) if (hl_mmu_unmap(hdev->kernel_ctx, - prop->dram_base_address + off, PAGE_SIZE_2MB)) + prop->dram_base_address + off, PAGE_SIZE_2MB, + (off + PAGE_SIZE_2MB) >= CPU_FW_IMAGE_SIZE)) dev_warn_ratelimited(hdev->dev, "Failed to unmap address 0x%llx\n", prop->dram_base_address + off); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 00c949f4ccd1..df34227dea31 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -1573,8 +1573,10 @@ int hl_mmu_init(struct hl_device *hdev); void hl_mmu_fini(struct hl_device *hdev); int hl_mmu_ctx_init(struct hl_ctx *ctx); void hl_mmu_ctx_fini(struct hl_ctx *ctx); -int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size); -int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size); +int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, + u32 page_size, bool flush_pte); +int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, + bool flush_pte); void hl_mmu_swap_out(struct hl_ctx *ctx); void hl_mmu_swap_in(struct hl_ctx *ctx); diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c index 6c72cb4eff54..b612b1ad0aac 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/memory.c @@ -747,7 +747,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, for (i = 0 ; i < phys_pg_pack->npages ; i++) { paddr = phys_pg_pack->pages[i]; - rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size); + rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size, + (i + 1) == phys_pg_pack->npages); if (rc) { dev_err(hdev->dev, "map failed for handle %u, npages: %llu, mapped: %llu", @@ -765,7 +766,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, err: next_vaddr = vaddr; for (i = 0 ; i < mapped_pg_cnt ; i++) { - if (hl_mmu_unmap(ctx, next_vaddr, page_size)) + if (hl_mmu_unmap(ctx, next_vaddr, page_size, + (i + 1) == mapped_pg_cnt)) dev_warn_ratelimited(hdev->dev, "failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n", phys_pg_pack->handle, next_vaddr, @@ -794,7 +796,8 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, next_vaddr = vaddr; for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) { - if (hl_mmu_unmap(ctx, next_vaddr, page_size)) + if (hl_mmu_unmap(ctx, next_vaddr, page_size, + (i + 1) == phys_pg_pack->npages)) dev_warn_ratelimited(hdev->dev, "unmap failed for vaddr: 0x%llx\n", next_vaddr); diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c index 6262b26e2086..006eee47909d 100644 --- a/drivers/misc/habanalabs/mmu.c +++ b/drivers/misc/habanalabs/mmu.c @@ -637,29 +637,27 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr) clear_hop3 = true; if (!clear_hop3) - goto flush; + goto mapped; clear_pte(ctx, hop3_pte_addr); if (put_pte(ctx, hop3_addr)) - goto flush; + goto mapped; clear_pte(ctx, hop2_pte_addr); if (put_pte(ctx, hop2_addr)) - goto flush; + goto mapped; clear_pte(ctx, hop1_pte_addr); if (put_pte(ctx, hop1_addr)) - goto flush; + goto mapped; clear_pte(ctx, hop0_pte_addr); } -flush: - flush(ctx); - +mapped: return 0; not_mapped: @@ -675,6 +673,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr) * @ctx: pointer to the context structure * @virt_addr: virt addr to map from * @page_size: size of the page to unmap + * @flush_pte: whether to do a PCI flush * * This function does the following: * - Check that the virt addr is mapped @@ -685,15 +684,19 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr) * changes the MMU hash, it must be protected by a lock. * However, because it maps only a single page, the lock should be implemented * in a higher level in order to protect the entire mapping of the memory area + * + * For optimization reasons PCI flush may be requested once after unmapping of + * large area. */ -int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size) +int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, + bool flush_pte) { struct hl_device *hdev = ctx->hdev; struct asic_fixed_properties *prop = &hdev->asic_prop; struct hl_mmu_properties *mmu_prop; u64 real_virt_addr; u32 real_page_size, npages; - int i, rc; + int i, rc = 0; bool is_dram_addr; if (!hdev->mmu_enable) @@ -729,12 +732,15 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size) for (i = 0 ; i < npages ; i++) { rc = _hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr); if (rc) - return rc; + break; real_virt_addr += real_page_size; } - return 0; + if (flush_pte) + flush(ctx); + + return rc; } static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, @@ -885,8 +891,6 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, get_pte(ctx, hop3_addr); } - flush(ctx); - return 0; err: @@ -909,6 +913,7 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, * @virt_addr: virt addr to map from * @phys_addr: phys addr to map to * @page_size: physical page size + * @flush_pte: whether to do a PCI flush * * This function does the following: * - Check that the virt addr is not mapped @@ -919,8 +924,12 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, * changes the MMU hash, it must be protected by a lock. * However, because it maps only a single page, the lock should be implemented * in a higher level in order to protect the entire mapping of the memory area + * + * For optimization reasons PCI flush may be requested once after mapping of + * large area. */ -int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size) +int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size, + bool flush_pte) { struct hl_device *hdev = ctx->hdev; struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -976,6 +985,9 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size) mapped_cnt++; } + if (flush_pte) + flush(ctx); + return 0; err: @@ -988,6 +1000,8 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size) real_virt_addr += real_page_size; } + flush(ctx); + return rc; }