linux_dsm_epyc7002/arch/powerpc/lib/vmx-helper.c
Simon Guo d58badfb7c powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
This patch add VMX primitives to do memcmp() in case the compare size
is equal or greater than 4K bytes. KSM feature can benefit from this.

Test result with following test program(replace the "^>" with ""):
------
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include <malloc.h>
>#include <stdlib.h>
>#include <string.h>
>#include <time.h>
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
        char *s1;
        char *s2;
        unsigned long i;

        s1 = memalign(128, SIZE);
        if (!s1) {
                perror("memalign");
                exit(1);
        }

        s2 = memalign(128, SIZE);
        if (!s2) {
                perror("memalign");
                exit(1);
        }

        for (i = 0; i < SIZE; i++)  {
                s1[i] = i & 0xff;
                s2[i] = i & 0xff;
        }
        for (i = 0; i < ITERATIONS; i++) {
		int ret = test_memcmp(s1, s2, SIZE);

		if (ret) {
			printf("return %d at[%ld]! should have returned zero\n", ret, i);
			abort();
		}
	}

        return 0;
}

int main(void)
{
        return test_harness(testcase, "memcmp");
}
------
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
	4.726728762 seconds time elapsed                                          ( +-  3.54%)
With VMX patch:
	4.234335473 seconds time elapsed                                          ( +-  2.63%)
		There is ~+10% improvement.

Testing with unaligned and different offset version (make s1 and s2 shift
random offset within 16 bytes) can archieve higher improvement than 10%..

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-07-24 22:03:21 +10:00

79 lines
1.9 KiB
C

/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2011
*
* Authors: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
* Anton Blanchard <anton@au.ibm.com>
*/
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <asm/switch_to.h>
#include <asm/asm-prototypes.h>
int enter_vmx_usercopy(void)
{
if (in_interrupt())
return 0;
preempt_disable();
/*
* We need to disable page faults as they can call schedule and
* thus make us lose the VMX context. So on page faults, we just
* fail which will cause a fallback to the normal non-vmx copy.
*/
pagefault_disable();
enable_kernel_altivec();
return 1;
}
/*
* This function must return 0 because we tail call optimise when calling
* from __copy_tofrom_user_power7 which returns 0 on success.
*/
int exit_vmx_usercopy(void)
{
disable_kernel_altivec();
pagefault_enable();
preempt_enable();
return 0;
}
int enter_vmx_ops(void)
{
if (in_interrupt())
return 0;
preempt_disable();
enable_kernel_altivec();
return 1;
}
/*
* All calls to this function will be optimised into tail calls. We are
* passed a pointer to the destination which we return as required by a
* memcpy implementation.
*/
void *exit_vmx_ops(void *dest)
{
disable_kernel_altivec();
preempt_enable();
return dest;
}