mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-14 01:46:21 +07:00
5777eaed56
Apparently there exist certain workloads which rely heavily on software checksumming, for which the generic do_csum() implementation becomes a significant bottleneck. Therefore let's give arm64 its own optimised version - for ease of maintenance this foregoes assembly or intrinsics, and is thus not actually arm64-specific, but does rely heavily on C idioms that translate well to the A64 ISA and the typical load/store capabilities of most ARMv8 CPU cores. The resulting increase in checksum throughput scales nicely with buffer size, tending towards 4x for a small in-order core (Cortex-A53), and up to 6x or more for an aggressive big core (Ampere eMAG). Reported-by: Lingyan Huang <huanglingyan2@huawei.com> Tested-by: Lingyan Huang <huanglingyan2@huawei.com> Signed-off-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Will Deacon <will@kernel.org>
19 lines
613 B
Makefile
19 lines
613 B
Makefile
# SPDX-License-Identifier: GPL-2.0
# Objects that always go on this directory's library list: the "y" suffix is
# written literally, so no CONFIG_* symbol gates any of them.
# NOTE: keep comments out of the continued lines below -- a '#' inside a
# backslash continuation would swallow the rest of the list.
lib-y := clear_user.o delay.o copy_from_user.o \
         copy_to_user.o copy_in_user.o copy_page.o \
         clear_page.o csum.o memchr.o memcpy.o memmove.o \
         memset.o memcmp.o strcmp.o strncmp.o strlen.o \
         strnlen.o strchr.o strrchr.o tishift.o

# The NEON XOR object is only considered when CONFIG_KERNEL_MODE_NEON expands
# to exactly "y"; otherwise nothing in this conditional takes effect.
ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
# Listed under obj-<value of CONFIG_XOR_BLOCKS>, so that symbol decides which
# object list (if any) xor-neon.o ends up on.
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
# Per-object flag tweaks for xor-neon.o only: take -mgeneral-regs-only out of
# its compiler flags (presumably so the compiler may use the SIMD registers --
# confirm against the compiler documentation) and add -ffreestanding.
CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
CFLAGS_xor-neon.o += -ffreestanding
endif

# uaccess_flushcache.o joins the library list selected by the value of
# CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE (lib-y when that symbol expands to "y").
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o

# crc32.o is appended to the object list named by the value of CONFIG_CRC32.
obj-$(CONFIG_CRC32) += crc32.o

# error-inject.o is appended to the object list named by the value of
# CONFIG_FUNCTION_ERROR_INJECTION.
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o