linux_dsm_epyc7002/arch/hexagon/lib/memcpy.S
Thomas Gleixner 08dbd0f8ef treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 267
Based on 1 normalized pattern(s):

  this program is free software you can redistribute it and or modify
  it under the terms of the gnu general public license version 2 and
  only version 2 as published by the free software foundation this
  program is distributed in the hope that it will be useful but
  without any warranty without even the implied warranty of
  merchantability or fitness for a particular purpose see the gnu
  general public license for more details you should have received a
  copy of the gnu general public license along with this program if
  not write to the free software foundation inc 51 franklin street
  fifth floor boston ma 02110 1301 usa

extracted by the scancode license scanner the SPDX license identifier

  GPL-2.0-only

has been chosen to replace the boilerplate/reference in 94 file(s).

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Allison Randal <allison@lohutok.net>
Reviewed-by: Richard Fontana <rfontana@redhat.com>
Reviewed-by: Alexios Zavras <alexios.zavras@intel.com>
Cc: linux-spdx@vger.kernel.org
Link: https://lkml.kernel.org/r/20190529141334.043630402@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-06-05 17:30:29 +02:00

530 lines
15 KiB
ArmAsm

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
*/
/*
* Description
*
* library function for memcpy where length bytes are copied from
* ptr_in to ptr_out. ptr_out is returned unchanged.
* Allows any combination of alignment on input and output pointers
* and length from 0 to 2^32-1
*
* Restrictions
* The arrays should not overlap, the program will produce undefined output
* if they do.
* For blocks less than 16 bytes a byte by byte copy is performed. For
* 8byte alignments, and length multiples, a dword copy is performed up to
* 96bytes
* History
*
* DJH 5/15/09 Initial version 1.0
* DJH 6/ 1/09 Version 1.1 modified ABI to inlcude R16-R19
* DJH 7/12/09 Version 1.2 optimized codesize down to 760 was 840
* DJH 10/14/09 Version 1.3 added special loop for aligned case, was
* overreading bloated codesize back up to 892
* DJH 4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
* occurring if only 1 left outstanding, fixes bug
* # 3888, corrected for all alignments. Peeled off
* 1 32byte chunk from kernel loop and extended 8byte
* loop at end to solve all combinations and prevent
* over read. Fixed Ldword_loop_prolog to prevent
* overread for blocks less than 48bytes. Reduced
* codesize to 752 bytes
* DJH 4/21/10 version 1.5 1.4 fix broke code for input block ends not
* aligned to dword boundaries,underwriting by 1
* byte, added detection for this and fixed. A
* little bloat.
* DJH 4/23/10 version 1.6 corrected stack error, R20 was not being restored
* always, fixed the error of R20 being modified
* before it was being saved
* Natural c model
* ===============
* void * memcpy(char * ptr_out, char * ptr_in, int length) {
* int i;
* if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
* return(ptr_out);
* }
*
* Optimized memcpy function
* =========================
* void * memcpy(char * ptr_out, char * ptr_in, int len) {
* int i, prolog, kernel, epilog, mask;
* u8 offset;
* s64 data0, dataF8, data70;
*
* s64 * ptr8_in;
* s64 * ptr8_out;
* s32 * ptr4;
* s16 * ptr2;
*
* offset = ((int) ptr_in) & 7;
* ptr8_in = (s64 *) &ptr_in[-offset]; //read in the aligned pointers
*
* data70 = *ptr8_in++;
* dataF8 = *ptr8_in++;
*
* data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
*
* prolog = 32 - ((int) ptr_out);
* mask = 0x7fffffff >> HEXAGON_R_cl0_R(len);
* prolog = prolog & mask;
* kernel = len - prolog;
* epilog = kernel & 0x1F;
* kernel = kernel>>5;
*
* if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
* ptr2 = (s16 *) &ptr_out[0];
* if (prolog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
* ptr4 = (s32 *) &ptr_out[0];
* if (prolog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
*
* offset = offset + (prolog & 7);
* if (offset >= 8) {
* data70 = dataF8;
* dataF8 = *ptr8_in++;
* }
* offset = offset & 0x7;
*
* prolog = prolog >> 3;
* if (prolog) for (i=0; i < prolog; i++) {
* data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
* ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
* data70 = dataF8;
* dataF8 = *ptr8_in++;
* }
* if(kernel) { kernel -= 1; epilog += 32; }
* if(kernel) for(i=0; i < kernel; i++) {
* data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
* ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
* data70 = *ptr8_in++;
*
* data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
* ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
* dataF8 = *ptr8_in++;
*
* data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
* ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
* data70 = *ptr8_in++;
*
* data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
* ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
* dataF8 = *ptr8_in++;
* }
* epilogdws = epilog >> 3;
* if (epilogdws) for (i=0; i < epilogdws; i++) {
* data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
* ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
* data70 = dataF8;
* dataF8 = *ptr8_in++;
* }
* data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
*
* ptr4 = (s32 *) &ptr_out[0];
* if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
* ptr2 = (s16 *) &ptr_out[0];
* if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
* if (epilog & 1) { *ptr_out++ = (u8) data0; }
*
* return(ptr_out - length);
* }
*
* Codesize : 784 bytes
*/
#define ptr_out R0 /* destination pounter */
#define ptr_in R1 /* source pointer */
#define len R2 /* length of copy in bytes */
#define data70 R13:12 /* lo 8 bytes of non-aligned transfer */
#define dataF8 R11:10 /* hi 8 bytes of non-aligned transfer */
#define ldata0 R7:6 /* even 8 bytes chunks */
#define ldata1 R25:24 /* odd 8 bytes chunks */
#define data1 R7 /* lower 8 bytes of ldata1 */
#define data0 R6 /* lower 8 bytes of ldata0 */
#define ifbyte p0 /* if transfer has bytes in epilog/prolog */
#define ifhword p0 /* if transfer has shorts in epilog/prolog */
#define ifword p0 /* if transfer has words in epilog/prolog */
#define noprolog p0 /* no prolog, xfer starts at 32byte */
#define nokernel p1 /* no 32byte multiple block in the transfer */
#define noepilog p0 /* no epilog, xfer ends on 32byte boundary */
#define align p2 /* alignment of input rel to 8byte boundary */
#define kernel1 p0 /* kernel count == 1 */
#define dalign R25 /* rel alignment of input to output data */
#define star3 R16 /* number bytes in prolog - dwords */
#define rest R8 /* length - prolog bytes */
#define back R7 /* nr bytes > dword boundary in src block */
#define epilog R3 /* bytes in epilog */
#define inc R15:14 /* inc kernel by -1 and defetch ptr by 32 */
#define kernel R4 /* number of 32byte chunks in kernel */
#define ptr_in_p_128 R5 /* pointer for prefetch of input data */
#define mask R8 /* mask used to determine prolog size */
#define shift R8 /* used to work a shifter to extract bytes */
#define shift2 R5 /* in epilog to workshifter to extract bytes */
#define prolog R15 /* bytes in prolog */
#define epilogdws R15 /* number dwords in epilog */
#define shiftb R14 /* used to extract bytes */
#define offset R9 /* same as align in reg */
#define ptr_out_p_32 R17 /* pointer to output dczero */
#define align888 R14 /* if simple dword loop can be used */
#define len8 R9 /* number of dwords in length */
#define over R20 /* nr of bytes > last inp buf dword boundary */
#define ptr_in_p_128kernel R5:4 /* packed fetch pointer & kernel cnt */
.section .text
.p2align 4
.global memcpy
.type memcpy, @function
memcpy:
{
p2 = cmp.eq(len, #0); /* =0 */
align888 = or(ptr_in, ptr_out); /* %8 < 97 */
p0 = cmp.gtu(len, #23); /* %1, <24 */
p1 = cmp.eq(ptr_in, ptr_out); /* attempt to overwrite self */
}
{
p1 = or(p2, p1);
p3 = cmp.gtu(len, #95); /* %8 < 97 */
align888 = or(align888, len); /* %8 < 97 */
len8 = lsr(len, #3); /* %8 < 97 */
}
{
dcfetch(ptr_in); /* zero/ptrin=ptrout causes fetch */
p2 = bitsclr(align888, #7); /* %8 < 97 */
if(p1) jumpr r31; /* =0 */
}
{
p2 = and(p2,!p3); /* %8 < 97 */
if (p2.new) len = add(len, #-8); /* %8 < 97 */
if (p2.new) jump:NT .Ldwordaligned; /* %8 < 97 */
}
{
if(!p0) jump .Lbytes23orless; /* %1, <24 */
mask.l = #LO(0x7fffffff);
/* all bytes before line multiples of data */
prolog = sub(#0, ptr_out);
}
{
/* save r31 on stack, decrement sp by 16 */
allocframe(#24);
mask.h = #HI(0x7fffffff);
ptr_in_p_128 = add(ptr_in, #32);
back = cl0(len);
}
{
memd(sp+#0) = R17:16; /* save r16,r17 on stack6 */
r31.l = #LO(.Lmemcpy_return); /* set up final return pointer */
prolog &= lsr(mask, back);
offset = and(ptr_in, #7);
}
{
memd(sp+#8) = R25:24; /* save r25,r24 on stack */
dalign = sub(ptr_out, ptr_in);
r31.h = #HI(.Lmemcpy_return); /* set up final return pointer */
}
{
/* see if there if input buffer end if aligned */
over = add(len, ptr_in);
back = add(len, offset);
memd(sp+#16) = R21:20; /* save r20,r21 on stack */
}
{
noprolog = bitsclr(prolog, #7);
prolog = and(prolog, #31);
dcfetch(ptr_in_p_128);
ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
kernel = sub(len, prolog);
shift = asl(prolog, #3);
star3 = and(prolog, #7);
ptr_in = and(ptr_in, #-8);
}
{
prolog = lsr(prolog, #3);
epilog = and(kernel, #31);
ptr_out_p_32 = add(ptr_out, prolog);
over = and(over, #7);
}
{
p3 = cmp.gtu(back, #8);
kernel = lsr(kernel, #5);
dcfetch(ptr_in_p_128);
ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
p1 = cmp.eq(prolog, #0);
if(!p1.new) prolog = add(prolog, #1);
dcfetch(ptr_in_p_128); /* reserve the line 64bytes on */
ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
nokernel = cmp.eq(kernel,#0);
dcfetch(ptr_in_p_128); /* reserve the line 64bytes on */
ptr_in_p_128 = add(ptr_in_p_128, #32);
shiftb = and(shift, #8);
}
{
dcfetch(ptr_in_p_128); /* reserve the line 64bytes on */
ptr_in_p_128 = add(ptr_in_p_128, #32);
if(nokernel) jump .Lskip64;
p2 = cmp.eq(kernel, #1); /* skip ovr if kernel == 0 */
}
{
dczeroa(ptr_out_p_32);
/* don't advance pointer */
if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
dalign = and(dalign, #31);
dczeroa(ptr_out_p_32);
}
.Lskip64:
{
data70 = memd(ptr_in++#16);
if(p3) dataF8 = memd(ptr_in+#8);
if(noprolog) jump .Lnoprolog32;
align = offset;
}
/* upto initial 7 bytes */
{
ldata0 = valignb(dataF8, data70, align);
ifbyte = tstbit(shift,#3);
offset = add(offset, star3);
}
{
if(ifbyte) memb(ptr_out++#1) = data0;
ldata0 = lsr(ldata0, shiftb);
shiftb = and(shift, #16);
ifhword = tstbit(shift,#4);
}
{
if(ifhword) memh(ptr_out++#2) = data0;
ldata0 = lsr(ldata0, shiftb);
ifword = tstbit(shift,#5);
p2 = cmp.gtu(offset, #7);
}
{
if(ifword) memw(ptr_out++#4) = data0;
if(p2) data70 = dataF8;
if(p2) dataF8 = memd(ptr_in++#8); /* another 8 bytes */
align = offset;
}
.Lnoprolog32:
{
p3 = sp1loop0(.Ldword_loop_prolog, prolog)
rest = sub(len, star3); /* whats left after the loop */
p0 = cmp.gt(over, #0);
}
if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
if(p3) memd(ptr_out++#8) = ldata0;
ldata0 = valignb(dataF8, data70, align);
p0 = cmp.gt(rest, #16);
}
{
data70 = dataF8;
if(p0) dataF8 = memd(ptr_in++#8);
rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
/* kernel is at least 32bytes */
p3 = cmp.gtu(kernel, #0);
/* last itn. remove edge effects */
if(p3.new) kernel = add(kernel, #-1);
/* dealt with in last dword loop */
if(p3.new) epilog = add(epilog, #32);
}
{
nokernel = cmp.eq(kernel, #0); /* after adjustment, recheck */
if(nokernel.new) jump:NT .Lepilog; /* likely not taken */
inc = combine(#32, #-1);
p3 = cmp.gtu(dalign, #24);
}
{
if(p3) jump .Lodd_alignment;
}
{
loop0(.Loword_loop_25to31, kernel);
kernel1 = cmp.gtu(kernel, #1);
rest = kernel;
}
.falign
.Loword_loop_25to31:
{
dcfetch(ptr_in_p_128); /* prefetch 4 lines ahead */
if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
dczeroa(ptr_out_p_32); /* reserve the next 32bytes in cache */
p3 = cmp.eq(kernel, rest);
}
{
/* kernel -= 1 */
ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
/* kill write on first iteration */
if(!p3) memd(ptr_out++#8) = ldata1;
ldata1 = valignb(dataF8, data70, align);
data70 = memd(ptr_in++#8);
}
{
memd(ptr_out++#8) = ldata0;
ldata0 = valignb(data70, dataF8, align);
dataF8 = memd(ptr_in++#8);
}
{
memd(ptr_out++#8) = ldata1;
ldata1 = valignb(dataF8, data70, align);
data70 = memd(ptr_in++#8);
}
{
memd(ptr_out++#8) = ldata0;
ldata0 = valignb(data70, dataF8, align);
dataF8 = memd(ptr_in++#8);
kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
memd(ptr_out++#8) = ldata1;
jump .Lepilog;
}
.Lodd_alignment:
{
loop0(.Loword_loop_00to24, kernel);
kernel1 = cmp.gtu(kernel, #1);
rest = add(kernel, #-1);
}
.falign
.Loword_loop_00to24:
{
dcfetch(ptr_in_p_128); /* prefetch 4 lines ahead */
ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
dczeroa(ptr_out_p_32); /* reserve the next 32bytes in cache */
}
{
memd(ptr_out++#8) = ldata0;
ldata0 = valignb(dataF8, data70, align);
data70 = memd(ptr_in++#8);
}
{
memd(ptr_out++#8) = ldata0;
ldata0 = valignb(data70, dataF8, align);
dataF8 = memd(ptr_in++#8);
}
{
memd(ptr_out++#8) = ldata0;
ldata0 = valignb(dataF8, data70, align);
data70 = memd(ptr_in++#8);
}
{
memd(ptr_out++#8) = ldata0;
ldata0 = valignb(data70, dataF8, align);
dataF8 = memd(ptr_in++#8);
kernel1 = cmp.gtu(kernel, #1);
}:endloop0
.Lepilog:
{
noepilog = cmp.eq(epilog,#0);
epilogdws = lsr(epilog, #3);
kernel = and(epilog, #7);
}
{
if(noepilog) jumpr r31;
if(noepilog) ptr_out = sub(ptr_out, len);
p3 = cmp.eq(epilogdws, #0);
shift2 = asl(epilog, #3);
}
{
shiftb = and(shift2, #32);
ifword = tstbit(epilog,#2);
if(p3) jump .Lepilog60;
if(!p3) epilog = add(epilog, #-16);
}
{
loop0(.Ldword_loop_epilog, epilogdws);
/* stop criteria is lsbs unless = 0 then its 8 */
p3 = cmp.eq(kernel, #0);
if(p3.new) kernel= #8;
p1 = cmp.gt(over, #0);
}
/* if not aligned to end of buffer execute 1 more iteration */
if(p1) kernel= #0;
.Ldword_loop_epilog:
{
memd(ptr_out++#8) = ldata0;
ldata0 = valignb(dataF8, data70, align);
p3 = cmp.gt(epilog, kernel);
}
{
data70 = dataF8;
if(p3) dataF8 = memd(ptr_in++#8);
epilog = add(epilog, #-8);
}:endloop0
/* copy last 7 bytes */
.Lepilog60:
{
if(ifword) memw(ptr_out++#4) = data0;
ldata0 = lsr(ldata0, shiftb);
ifhword = tstbit(epilog,#1);
shiftb = and(shift2, #16);
}
{
if(ifhword) memh(ptr_out++#2) = data0;
ldata0 = lsr(ldata0, shiftb);
ifbyte = tstbit(epilog,#0);
if(ifbyte.new) len = add(len, #-1);
}
{
if(ifbyte) memb(ptr_out) = data0;
ptr_out = sub(ptr_out, len); /* return dest pointer */
jumpr r31;
}
/* do byte copy for small n */
.Lbytes23orless:
{
p3 = sp1loop0(.Lbyte_copy, len);
len = add(len, #-1);
}
.Lbyte_copy:
{
data0 = memb(ptr_in++#1);
if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
memb(ptr_out) = data0;
ptr_out = sub(ptr_out, len);
jumpr r31;
}
/* do dword copies for aligned in, out and length */
.Ldwordaligned:
{
p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
if(p3) memd(ptr_out++#8) = ldata0;
ldata0 = memd(ptr_in++#8);
}:endloop0
{
memd(ptr_out) = ldata0;
ptr_out = sub(ptr_out, len);
jumpr r31; /* return to function caller */
}
.Lmemcpy_return:
r21:20 = memd(sp+#16); /* restore r20+r21 */
{
r25:24 = memd(sp+#8); /* restore r24+r25 */
r17:16 = memd(sp+#0); /* restore r16+r17 */
}
deallocframe; /* restore r31 and incrment stack by 16 */
jumpr r31