linux_dsm_epyc7002/arch/arm/include/asm/div64.h
Nicolas Pitre 73e592f3bc ARM: 8504/1: __arch_xprod_64(): small optimization
The tmp variable is used twice: first to pose as a register containing
a value of zero, and then to provide a temporary register that initially
is zero and gets some value added to it. But somehow gcc decides to
split those two usages into different registers.

Example code:

u64 div64const1000(u64 x)
{
	u32 y = 1000;
	do_div(x, y);
	return x;
}

Result:

div64const1000:
	push	{r4, r5, r6, r7, lr}
	mov	lr, #0
	mov	r6, r0
	mov	r7, r1
	adr	r5, .L8
	ldrd	r4, [r5]
	mov	r1, lr
	umull	r2, r3, r4, r6
	cmn	r2, r4
	adcs	r3, r3, r5
	adc	r2, lr, #0
	umlal	r3, r2, r5, r6
	umlal	r3, r1, r4, r7
	mov	r3, #0
	adds	r2, r1, r2
	adc	r3, r3, #0
	umlal	r2, r3, r5, r7
	lsr	r0, r2, #9
	lsr	r1, r3, #9
	orr	r0, r0, r3, lsl #23
	pop	{r4, r5, r6, r7, pc}
	.align	3
.L8:
	.word	-1924145349
	.word	-2095944041

Full kernel build size:

   text	   data	    bss	    dec	    hex	filename
13663814	1553940	 351368	15569122	 ed90e2	vmlinux

Here the two instances of 'tmp' are assigned to r1 and lr.

To avoid that, let's mark the first 'tmp' usage in __arch_xprod_64()
with a "+r" constraint, even though the register is not written to, so
as to create a dependency for the second usage with the effect of
enforcing a single temporary register throughout.
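
In inline asm terms, 'tmp' (previously a plain input operand) becomes a
read-write operand of the first asm block that uses it, as in the
resulting __arch_xprod_64() code below:

	asm (	"umull	%Q0, %R0, %Q2, %Q3\n\t"
		"cmn	%Q0, %Q2\n\t"
		"adcs	%R0, %R0, %R2\n\t"
		"adc	%Q0, %1, #0"
		: "=&r" (res), "+&r" (tmp)
		: "r" (m), "r" (n)
		: "cc");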

Result:

div64const1000:
	push	{r4, r5, r6, r7}
	movs	r3, #0
	adr	r5, .L8
	ldrd	r4, [r5]
	umull	r6, r7, r4, r0
	cmn	r6, r4
	adcs	r7, r7, r5
	adc	r6, r3, #0
	umlal	r7, r6, r5, r0
	umlal	r7, r3, r4, r1
	mov	r7, #0
	adds	r6, r3, r6
	adc	r7, r7, #0
	umlal	r6, r7, r5, r1
	lsr	r0, r6, #9
	lsr	r1, r7, #9
	orr	r0, r0, r7, lsl #23
	pop	{r4, r5, r6, r7}
	bx	lr
	.align	3
.L8:
	.word	-1924145349
	.word	-2095944041

   text	   data	    bss	    dec	    hex	filename
13663438	1553940	 351368	15568746	 ed8f6a	vmlinux

This time 'tmp' is assigned to r3 and used throughout. However, being
assigned to r3 blocks usage of the r2-r3 double register slot for 64-bit
values, forcing more registers to be spilled onto the stack. Let's try
to help it by forcing 'tmp' into the caller-saved ip register.
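
Concretely, 'tmp' becomes a local register variable bound to ip, as seen
in the resulting code further below:

	register unsigned int tmp asm("ip") = 0;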

Result:

div64const1000:
	stmfd	sp!, {r4, r5}
	mov	ip, #0
	adr	r5, .L8
	ldrd	r4, [r5]
	umull	r2, r3, r4, r0
	cmn	r2, r4
	adcs	r3, r3, r5
	adc	r2, ip, #0
	umlal	r3, r2, r5, r0
	umlal	r3, ip, r4, r1
	mov	r3, #0
	adds	r2, ip, r2
	adc	r3, r3, #0
	umlal	r2, r3, r5, r1
	mov	r0, r2, lsr #9
	mov	r1, r3, lsr #9
	orr	r0, r0, r3, asl #23
	ldmfd	sp!, {r4, r5}
	bx	lr
	.align	3
.L8:
	.word	-1924145349
	.word	-2095944041

   text	   data	    bss	    dec	    hex	filename
13662838	1553940	 351368	15568146	 ed8d12	vmlinux

We could make the code marginally smaller yet by forcing 'tmp' to lr
instead, but that would have a negative impact on branch prediction for
which "bx lr" is optimal.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2016-02-11 15:33:37 +00:00

#ifndef __ASM_ARM_DIV64
#define __ASM_ARM_DIV64
#include <linux/types.h>
#include <asm/compiler.h>
/*
 * The semantics of __div64_32() are:
 *
 *	uint32_t __div64_32(uint64_t *n, uint32_t base)
 *	{
 *		uint32_t remainder = *n % base;
 *		*n = *n / base;
 *		return remainder;
 *	}
 *
 * In other words, a 64-bit dividend with a 32-bit divisor producing
 * a 64-bit result and a 32-bit remainder. To accomplish this optimally
 * we override the generic version in lib/div64.c to call our __do_div64
 * assembly implementation with a completely non-standard calling
 * convention for arguments and results (beware).
 */
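/*
 * As the register bindings in __div64_32() below show, __do_div64 takes
 * the dividend in r0-r1 and the divisor in r4, and hands back the 64-bit
 * quotient in r2-r3 and the 32-bit remainder in the dividend's upper-half
 * register (r1 on little-endian, r0 on big-endian), clobbering ip, lr and
 * the condition flags.
 */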
#ifdef __ARMEB__
#define __xh "r0"
#define __xl "r1"
#else
#define __xl "r0"
#define __xh "r1"
#endif
static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
{
	register unsigned int __base      asm("r4") = base;
	register unsigned long long __n   asm("r0") = *n;
	register unsigned long long __res asm("r2");
	register unsigned int __rem       asm(__xh);
	asm(	__asmeq("%0", __xh)
		__asmeq("%1", "r2")
		__asmeq("%2", "r0")
		__asmeq("%3", "r4")
		"bl	__do_div64"
		: "=r" (__rem), "=r" (__res)
		: "r" (__n), "r" (__base)
		: "ip", "lr", "cc");
	*n = __res;
	return __rem;
}
#define __div64_32 __div64_32
#if !defined(CONFIG_AEABI)
/*
 * In OABI configurations, some uses of the do_div function
 * cause gcc to run out of registers. To work around that,
 * we can force the use of the out-of-line version for
 * configurations that build an OABI kernel.
 */
#define do_div(n, base) __div64_32(&(n), base)
#else
/*
 * gcc versions earlier than 4.0 are simply too problematic for the
 * __div64_const32() code in asm-generic/div64.h. First there is
 * gcc PR 15089, which tends to trigger on more complex constructs:
 * spurious .global __udivsi3 directives are inserted even if none of
 * those symbols are referenced in the generated code. And those gcc
 * versions are not able to do constant propagation on long long
 * values anyway.
 */
#define __div64_const32_is_OK (__GNUC__ >= 4)
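/*
 * Semantics, as documented for the generic helper in asm-generic/div64.h
 * that this version replaces:
 *
 *	retval = ((bias ? m : 0) + m * n) >> 64
 *
 * i.e. the upper 64 bits of the 128-bit product, with m added in first
 * when bias is set.
 */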
static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
{
	unsigned long long res;
	register unsigned int tmp asm("ip") = 0;

	if (!bias) {
		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
			"mov	%Q0, #0"
			: "=&r" (res)
			: "r" (m), "r" (n)
			: "cc");
	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
		res = m;
		asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
			"mov	%Q0, #0"
			: "+&r" (res)
			: "r" (m), "r" (n)
			: "cc");
	} else {
		asm (	"umull	%Q0, %R0, %Q2, %Q3\n\t"
			"cmn	%Q0, %Q2\n\t"
			"adcs	%R0, %R0, %R2\n\t"
			"adc	%Q0, %1, #0"
			: "=&r" (res), "+&r" (tmp)
			: "r" (m), "r" (n)
			: "cc");
	}

	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
		asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
			"umlal	%R0, %Q0, %Q1, %R2\n\t"
			"mov	%R0, #0\n\t"
			"umlal	%Q0, %R0, %R1, %R2"
			: "+&r" (res)
			: "r" (m), "r" (n)
			: "cc");
	} else {
		asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"
			"umlal	%R0, %1, %Q2, %R3\n\t"
			"mov	%R0, #0\n\t"
			"adds	%Q0, %1, %Q0\n\t"
			"adc	%R0, %R0, #0\n\t"
			"umlal	%Q0, %R0, %R2, %R3"
			: "+&r" (res), "+&r" (tmp)
			: "r" (m), "r" (n)
			: "cc");
	}

	return res;
}
#define __arch_xprod_64 __arch_xprod_64
#include <asm-generic/div64.h>
#endif
#endif