md/raid6: implement recovery using ARM NEON intrinsics

Provide a NEON-accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2024-11-24 07:00:52 +07:00 · 2017-07-13 18:16:01 +01:00 · 2017-07-13 18:16:01 +01:00 · 6ec4e2514d
commit 6ec4e2514d
parent 35129dde88
5 changed files with 234 additions and 1 deletions
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@ -121,6 +121,7 @@ extern const struct raid6_recov_calls raid6_recov_ssse3;
 extern const struct raid6_recov_calls raid6_recov_avx2;
 extern const struct raid6_recov_calls raid6_recov_avx512;
 extern const struct raid6_recov_calls raid6_recov_s390xc;
 extern const struct raid6_recov_calls raid6_recov_neon;
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@ -5,7 +5,7 @@ raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
-raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding
 ifeq ($(ARCH),arm)
 NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
 endif
 CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
 ifeq ($(ARCH),arm64)
 CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@ -112,6 +112,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #endif
 #ifdef CONFIG_S390
 	&raid6_recov_s390xc,
 #endif
 #if defined(CONFIG_KERNEL_MODE_NEON)
 	&raid6_recov_neon,
 #endif
 	&raid6_recov_intx1,
 	NULL
--- a/lib/raid6/recov_neon.c
+++ b/lib/raid6/recov_neon.c
@ -0,0 +1,110 @@
 /*
 * Copyright (C) 2012 Intel Corporation
 * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
 #include <linux/raid/pq.h>
 #ifdef __KERNEL__
 #include <asm/neon.h>
 #else
 #define kernel_neon_begin()
 #define kernel_neon_end()
 #define cpu_has_neon()		(1)
 #endif
 /*
 * Availability check for the NEON recovery routines: returns whatever
 * cpu_has_neon() reports. Installed as the .valid hook of raid6_recov_neon.
 */
 static int raid6_has_neon(void)
 {
 	int usable = cpu_has_neon();
 	return usable;
 }
 /*
 * NEON inner loops, defined in recov_neon_inner.c. In this file they are
 * only called between kernel_neon_begin() and kernel_neon_end().
 *
 * NOTE(review): 'bytes' is an int here while the wrappers below receive it
 * as a size_t, so the value is narrowed at the call sites.
 */
 void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
 			      uint8_t *dq, const uint8_t *pbmul,
 			      const uint8_t *qmul);
 void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
 			      const uint8_t *qmul);
 static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
 		int failb, void **ptrs)
 {
 	u8 *p, *q, *dp, *dq;
 	const u8 *pbmul;	/* P multiplier table for B data */
 	const u8 *qmul;		/* Q multiplier table (for both) */
 	p = (u8 *)ptrs[disks - 2];
 	q = (u8 *)ptrs[disks - 1];
 	/*
 	 * Compute syndrome with zero for the missing data pages
 	 * Use the dead data pages as temporary storage for
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
 	ptrs[faila] = (void *)raid6_empty_zero_page;
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
 	ptrs[failb] = (void *)raid6_empty_zero_page;
 	ptrs[disks - 1] = dq;
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
 	/* Restore pointer table */
 	ptrs[faila]     = dp;
 	ptrs[failb]     = dq;
 	ptrs[disks - 2] = p;
 	ptrs[disks - 1] = q;
 	/* Now, pick the proper data tables */
 	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
 					 raid6_gfexp[failb]]];
 	kernel_neon_begin();
 	__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
 	kernel_neon_end();
 }
 static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
 		void **ptrs)
 {
 	u8 *p, *q, *dq;
 	const u8 *qmul;		/* Q multiplier table */
 	p = (u8 *)ptrs[disks - 2];
 	q = (u8 *)ptrs[disks - 1];
 	/*
 	 * Compute syndrome with zero for the missing data page
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
 	ptrs[faila] = (void *)raid6_empty_zero_page;
 	ptrs[disks - 1] = dq;
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
 	/* Restore pointer table */
 	ptrs[faila]     = dq;
 	ptrs[disks - 1] = q;
 	/* Now, pick the proper data tables */
 	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
 	kernel_neon_begin();
 	__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
 	kernel_neon_end();
 }
 /*
 * Method table for the NEON recovery implementation. algos.c places it in
 * raid6_recov_algos[] when CONFIG_KERNEL_MODE_NEON is defined.
 */
 const struct raid6_recov_calls raid6_recov_neon = {
 	/* identification */
 	.name		= "neon",
 	.priority	= 10,
 	/* availability check and recovery entry points */
 	.valid		= raid6_has_neon,
 	.data2		= raid6_2data_recov_neon,
 	.datap		= raid6_datap_recov_neon,
 };
--- a/lib/raid6/recov_neon_inner.c
+++ b/lib/raid6/recov_neon_inner.c
@ -0,0 +1,117 @@
 /*
 * Copyright (C) 2012 Intel Corporation
 * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
 #include <arm_neon.h>
 /*
 * Sixteen lanes of 0x0f. The recovery loops AND it with a byte vector to
 * keep only the low nibble of every byte before using it as a table index.
 */
 static const uint8x16_t x0f = {
 	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
 	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
 };
 #ifdef CONFIG_ARM
 /*
 * AArch32 does not provide this intrinsic natively because it does not
 * implement the underlying instruction. AArch32 only provides a 64-bit
 * wide vtbl.8 instruction, so use that instead.
 *
 * The table 'a' is reinterpreted through a union as a pair of 64-bit
 * vectors and passed to vtbl2_u8(). The low and high halves of the index
 * vector 'b' are looked up separately and joined with vcombine_u8().
 */
 static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 {
 	/* View the 128-bit table as the two-register pair vtbl2_u8() takes */
 	union {
 		uint8x16_t	val;
 		uint8x8x2_t	pair;
 	} __a = { a };
 	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
 			   vtbl2_u8(__a.pair, vget_high_u8(b)));
 }
 #endif
 /*
 * Inner loop of the two-data-block recovery.
 *
 * bytes: number of bytes to process in each of p, q, dp and dq
 * p, q:  only read
 * dp:    read, then overwritten with db ^ px (see below)
 * dq:    read, then overwritten with db
 * pbmul: 32-byte multiplier table: bytes 0-15 are indexed by the low nibble
 *        of the operand, bytes 16-31 by its high nibble, and the two
 *        lookups are XORed together
 * qmul:  32-byte multiplier table with the same layout
 *
 * For every byte position:
 *
 *	px = p ^ dp;
 *	qx = qmul-lookup(q ^ dq);
 *	db = pbmul-lookup(px) ^ qx;
 *	dq = db;
 *	dp = db ^ px;
 *
 * Whole 16-byte groups are handled with NEON; any remaining bytes are
 * handled one at a time, so a length that is not a multiple of 16 (or is
 * not positive) can neither run past the buffers nor loop forever.
 */
 void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
 			      uint8_t *dq, const uint8_t *pbmul,
 			      const uint8_t *qmul)
 {
 	const uint8x16_t pm0 = vld1q_u8(pbmul);
 	const uint8x16_t pm1 = vld1q_u8(pbmul + 16);
 	const uint8x16_t qm0 = vld1q_u8(qmul);
 	const uint8x16_t qm1 = vld1q_u8(qmul + 16);
 	/* Built with vdupq_n_u8() rather than a lane-by-lane initializer */
 	const uint8x16_t lo_nibble = vdupq_n_u8(0x0f);
 	while (bytes >= 16) {
 		uint8x16_t vx, vy, px, qx, db;
 		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
 		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 		/* Per-byte shift leaves the high nibble in bits 0-3 */
 		vy = vshrq_n_u8(vx, 4);
 		vx = vqtbl1q_u8(qm0, vandq_u8(vx, lo_nibble));
 		vy = vqtbl1q_u8(qm1, vy);
 		qx = veorq_u8(vx, vy);
 		vy = vshrq_n_u8(px, 4);
 		vx = vqtbl1q_u8(pm0, vandq_u8(px, lo_nibble));
 		vy = vqtbl1q_u8(pm1, vy);
 		vx = veorq_u8(vx, vy);
 		db = veorq_u8(vx, qx);
 		vst1q_u8(dq, db);
 		vst1q_u8(dp, veorq_u8(db, px));
 		bytes -= 16;
 		p += 16;
 		q += 16;
 		dp += 16;
 		dq += 16;
 	}
 	/* Scalar tail: same table lookups, one byte at a time */
 	while (bytes > 0) {
 		uint8_t px = *p ^ *dp;
 		uint8_t vq = *q ^ *dq;
 		uint8_t qx = qmul[vq & 0x0f] ^ qmul[16 + (vq >> 4)];
 		uint8_t db = pbmul[px & 0x0f] ^ pbmul[16 + (px >> 4)] ^ qx;
 		*dq++ = db;
 		*dp++ = db ^ px;
 		p++;
 		q++;
 		bytes--;
 	}
 }
 /*
 * Inner loop of the data + P recovery.
 *
 * bytes: number of bytes to process in each of p, q and dq
 * p:     read and updated in place (XORed with the new dq value)
 * q:     only read
 * dq:    read, then overwritten with the table lookup result
 * qmul:  32-byte multiplier table: bytes 0-15 are indexed by the low nibble
 *        of the operand, bytes 16-31 by its high nibble, and the two
 *        lookups are XORed together
 *
 * For every byte position:
 *
 *	dq = qmul-lookup(q ^ dq);
 *	p ^= dq;
 *
 * Whole 16-byte groups are handled with NEON; any remaining bytes are
 * handled one at a time, so a length that is not a multiple of 16 (or is
 * not positive) can neither run past the buffers nor loop forever.
 */
 void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
 			      const uint8_t *qmul)
 {
 	const uint8x16_t qm0 = vld1q_u8(qmul);
 	const uint8x16_t qm1 = vld1q_u8(qmul + 16);
 	/* Built with vdupq_n_u8() rather than a lane-by-lane initializer */
 	const uint8x16_t lo_nibble = vdupq_n_u8(0x0f);
 	while (bytes >= 16) {
 		uint8x16_t vx, vy;
 		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 		/* Per-byte shift leaves the high nibble in bits 0-3 */
 		vy = vshrq_n_u8(vx, 4);
 		vx = vqtbl1q_u8(qm0, vandq_u8(vx, lo_nibble));
 		vy = vqtbl1q_u8(qm1, vy);
 		vx = veorq_u8(vx, vy);
 		vy = veorq_u8(vx, vld1q_u8(p));
 		vst1q_u8(dq, vx);
 		vst1q_u8(p, vy);
 		bytes -= 16;
 		p += 16;
 		q += 16;
 		dq += 16;
 	}
 	/* Scalar tail: same table lookups, one byte at a time */
 	while (bytes > 0) {
 		uint8_t vq = *q ^ *dq;
 		uint8_t prod = qmul[vq & 0x0f] ^ qmul[16 + (vq >> 4)];
 		uint8_t newp = *p ^ prod;
 		*dq++ = prod;
 		*p++ = newp;
 		q++;
 		bytes--;
 	}
 }