mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-23 11:06:48 +07:00
73e592f3bc
The tmp variable is used twice: first to pose as a register containing a value of zero, and then to provide a temporary register that is initially zero and then has some value added to it. But somehow gcc decides to split those two usages in different registers. Example code: u64 div64const1000(u64 x) { u32 y = 1000; do_div(x, y); return x; } Result: div64const1000: push {r4, r5, r6, r7, lr} mov lr, #0 mov r6, r0 mov r7, r1 adr r5, .L8 ldrd r4, [r5] mov r1, lr umull r2, r3, r4, r6 cmn r2, r4 adcs r3, r3, r5 adc r2, lr, #0 umlal r3, r2, r5, r6 umlal r3, r1, r4, r7 mov r3, #0 adds r2, r1, r2 adc r3, r3, #0 umlal r2, r3, r5, r7 lsr r0, r2, #9 lsr r1, r3, #9 orr r0, r0, r3, lsl #23 pop {r4, r5, r6, r7, pc} .align 3 .L8: .word -1924145349 .word -2095944041 Full kernel build size: text data bss dec hex filename 13663814 1553940 351368 15569122 ed90e2 vmlinux Here the two instances of 'tmp' are assigned to r1 and lr. To avoid that, let's mark the first 'tmp' usage in __arch_xprod_64() with a "+r" constraint even if the register is not written to, so as to create a dependency for the second usage with the effect of enforcing a single temporary register throughout. Result: div64const1000: push {r4, r5, r6, r7} movs r3, #0 adr r5, .L8 ldrd r4, [r5] umull r6, r7, r4, r0 cmn r6, r4 adcs r7, r7, r5 adc r6, r3, #0 umlal r7, r6, r5, r0 umlal r7, r3, r4, r1 mov r7, #0 adds r6, r3, r6 adc r7, r7, #0 umlal r6, r7, r5, r1 lsr r0, r6, #9 lsr r1, r7, #9 orr r0, r0, r7, lsl #23 pop {r4, r5, r6, r7} bx lr .align 3 .L8: .word -1924145349 .word -2095944041 text data bss dec hex filename 13663438 1553940 351368 15568746 ed8f6a vmlinux This time 'tmp' is assigned to r3 and used throughout. However, by being assigned to r3, that blocks usage of the r2-r3 double register slot for 64-bit values, forcing more registers to be spilled on the stack. Let's try to help it by forcing 'tmp' to the caller-saved ip register. 
Result: div64const1000: stmfd sp!, {r4, r5} mov ip, #0 adr r5, .L8 ldrd r4, [r5] umull r2, r3, r4, r0 cmn r2, r4 adcs r3, r3, r5 adc r2, ip, #0 umlal r3, r2, r5, r0 umlal r3, ip, r4, r1 mov r3, #0 adds r2, ip, r2 adc r3, r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #9 mov r1, r3, lsr #9 orr r0, r0, r3, asl #23 ldmfd sp!, {r4, r5} bx lr .align 3 .L8: .word -1924145349 .word -2095944041 text data bss dec hex filename 13662838 1553940 351368 15568146 ed8d12 vmlinux We could make the code marginally smaller yet by forcing 'tmp' to lr instead, but that would have a negative impact on branch prediction for which "bx lr" is optimal. Signed-off-by: Nicolas Pitre <nico@linaro.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
131 lines
3.1 KiB
C
131 lines
3.1 KiB
C
#ifndef __ASM_ARM_DIV64
|
|
#define __ASM_ARM_DIV64
|
|
|
|
#include <linux/types.h>
|
|
#include <asm/compiler.h>
|
|
|
|
/*
|
|
* The semantics of __div64_32() are:
|
|
*
|
|
* uint32_t __div64_32(uint64_t *n, uint32_t base)
|
|
* {
|
|
* uint32_t remainder = *n % base;
|
|
* *n = *n / base;
|
|
* return remainder;
|
|
* }
|
|
*
|
|
* In other words, a 64-bit dividend with a 32-bit divisor producing
|
|
* a 64-bit result and a 32-bit remainder. To accomplish this optimally
|
|
* we override the generic version in lib/div64.c to call our __do_div64
|
|
* assembly implementation with a completely non-standard calling convention
|
|
* for arguments and results (beware).
|
|
*/
|
|
|
|
/*
 * Register names for the high (__xh) and low (__xl) 32-bit halves of a
 * 64-bit value held in the r0/r1 pair. When __ARMEB__ is defined
 * (big-endian build) r0 carries the high word and r1 the low word;
 * otherwise the assignment is reversed.
 */
#ifdef __ARMEB__
|
|
#define __xh "r0"
|
|
#define __xl "r1"
|
|
#else
|
|
#define __xl "r0"
|
|
#define __xh "r1"
|
|
#endif
|
|
|
|
/*
 * __div64_32 - divide a 64-bit value in place by a 32-bit divisor.
 * @n:    pointer to the dividend; overwritten with the quotient
 * @base: divisor
 *
 * Returns the remainder.
 *
 * The division itself is performed by the out-of-line __do_div64 assembly
 * routine, which does not use the normal procedure call convention. The
 * operands are therefore pinned to fixed registers through register-asm
 * variables: divisor in r4, dividend starting at r0, quotient starting at
 * r2, and the remainder in the register named by __xh.
 */
static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
|
|
{
|
|
/* Inputs expected by __do_div64. */
register unsigned int __base asm("r4") = base;
|
|
register unsigned long long __n asm("r0") = *n;
|
|
/* Outputs produced by __do_div64. */
register unsigned long long __res asm("r2");
|
|
register unsigned int __rem asm(__xh);
|
|
/*
 * The __asmeq() lines come from <asm/compiler.h>; presumably they make the
 * assembler verify that each operand really ended up in the named register
 * (confirm against that header). The call is made with "bl", so lr is
 * listed as clobbered, together with ip and the condition flags.
 */
asm( __asmeq("%0", __xh)
|
|
__asmeq("%1", "r2")
|
|
__asmeq("%2", "r0")
|
|
__asmeq("%3", "r4")
|
|
"bl __do_div64"
|
|
: "=r" (__rem), "=r" (__res)
|
|
: "r" (__n), "r" (__base)
|
|
: "ip", "lr", "cc");
|
|
/* Store the quotient back through the caller's pointer. */
*n = __res;
|
|
return __rem;
|
|
}
|
|
/*
 * Self-referential define: presumably lets generic code test with #ifdef
 * whether the architecture supplies its own __div64_32 (confirm against
 * the generic div64 code).
 */
#define __div64_32 __div64_32
|
|
|
|
#if !defined(CONFIG_AEABI)
|
|
|
|
/*
|
|
* In OABI configurations, some uses of the do_div function
|
|
* cause gcc to run out of registers. To work around that,
|
|
* we can force the use of the out-of-line version for
|
|
* configurations that build an OABI kernel.
|
|
*/
|
|
/*
 * n must be a 64-bit lvalue: its address is handed to __div64_32(), which
 * stores the quotient back into it. The macro evaluates to the remainder.
 */
#define do_div(n, base) __div64_32(&(n), base)
|
|
|
|
#else
|
|
|
|
/*
|
|
* gcc versions earlier than 4.0 are simply too problematic for the
|
|
* __div64_const32() code in asm-generic/div64.h. First there is
|
|
* gcc PR 15089 that tends to trigger on more complex constructs, spurious
|
|
* .global __udivsi3 are inserted even if none of those symbols are
|
|
* referenced in the generated code, and those gcc versions are not able
|
|
* to do constant propagation on long long values anyway.
|
|
*/
|
|
|
|
/*
 * Non-zero when __GNUC__ reports major version 4 or later. Presumably read
 * by asm-generic/div64.h to decide whether its constant-divisor path may be
 * used -- confirm there.
 */
#define __div64_const32_is_OK (__GNUC__ >= 4)
|
|
|
|
/*
 * __arch_xprod_64 - upper 64 bits of a 64x64-bit multiplication.
 * @m:    first factor
 * @n:    second factor
 * @bias: when true, m is added to the full-width product before the upper
 *        half is extracted
 *
 * Returns the high 64 bits of m * n (plus m when @bias is set), assembled
 * from 32x32->64 bit umull/umlal steps. In the asm templates %Q selects the
 * low and %R the high 32-bit register of a 64-bit operand.
 *
 * The work is split in two stages:
 *   1. low(m) * low(n), plus the bias, reduced to what carries into
 *      bit 32 and above;
 *   2. the two cross products and high(m) * high(n) accumulated on top.
 *
 * When bits 31 and 63 of m are both clear the intermediate sums fit in
 * 64 bits, so shorter sequences without explicit carry tracking are used.
 */
static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
|
|
{
|
|
unsigned long long res;
|
|
/*
 * Zero-valued scratch register. It is pinned to ip, and passed as "+r" even
 * where it is only read, so the compiler keeps a single register for it
 * across both asm statements of the carry-handling path and leaves the
 * r2-r3 pair free for 64-bit values.
 */
register unsigned int tmp asm("ip") = 0;
|
|
|
|
/* Stage 1: low(m) * low(n), optionally biased by m. */
if (!bias) {
|
|
/* Keep only the high word of the 32x32 product; the low word is dropped. */
asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
|
|
"mov %Q0, #0"
|
|
: "=&r" (res)
|
|
: "r" (m), "r" (n)
|
|
: "cc");
|
|
} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
|
|
/* m + low(m) * low(n) cannot overflow here: accumulate directly onto m. */
res = m;
|
|
asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t"
|
|
"mov %Q0, #0"
|
|
: "+&r" (res)
|
|
: "r" (m), "r" (n)
|
|
: "cc");
|
|
} else {
|
|
/*
 * General biased case: add m to the product by hand. cmn only produces the
 * carry out of the low words, adcs folds high(m) plus that carry into the
 * high word, and the final adc leaves the carry out of bit 63 in %Q0
 * (tmp is zero at this point).
 */
asm ( "umull %Q0, %R0, %Q2, %Q3\n\t"
|
|
"cmn %Q0, %Q2\n\t"
|
|
"adcs %R0, %R0, %R2\n\t"
|
|
"adc %Q0, %1, #0"
|
|
: "=&r" (res), "+&r" (tmp)
|
|
: "r" (m), "r" (n)
|
|
: "cc");
|
|
}
|
|
|
|
/*
 * Stage 2: add the cross products and high(m) * high(n). On entry %R0 holds
 * the running low word and %Q0 the word above it, hence the swapped
 * register order in the first umlal instructions.
 */
if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
|
|
/* No carry tracking needed: both cross products go into the same pair. */
asm ( "umlal %R0, %Q0, %R1, %Q2\n\t"
|
|
"umlal %R0, %Q0, %Q1, %R2\n\t"
|
|
"mov %R0, #0\n\t"
|
|
"umlal %Q0, %R0, %R1, %R2"
|
|
: "+&r" (res)
|
|
: "r" (m), "r" (n)
|
|
: "cc");
|
|
} else {
|
|
/*
 * The second cross product gets its own high word in tmp so that the
 * carry from combining the two high words (adds/adc) is not lost before
 * high(m) * high(n) is accumulated.
 */
asm ( "umlal %R0, %Q0, %R2, %Q3\n\t"
|
|
"umlal %R0, %1, %Q2, %R3\n\t"
|
|
"mov %R0, #0\n\t"
|
|
"adds %Q0, %1, %Q0\n\t"
|
|
"adc %R0, %R0, #0\n\t"
|
|
"umlal %Q0, %R0, %R2, %R3"
|
|
: "+&r" (res), "+&r" (tmp)
|
|
: "r" (m), "r" (n)
|
|
: "cc");
|
|
}
|
|
|
|
return res;
|
|
}
|
|
/*
 * Self-referential define: presumably lets asm-generic/div64.h (included
 * below) detect with #ifdef that an architecture-specific version exists
 * -- confirm against that header.
 */
#define __arch_xprod_64 __arch_xprod_64
|
|
|
|
#include <asm-generic/div64.h>
|
|
|
|
#endif
|
|
|
|
#endif
|