linux_dsm_epyc7002/arch/arm/mm
Zhaoxiu Zeng fff7fb0b2d lib/GCD.c: use binary GCD algorithm instead of Euclidean
The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available.  This
allows the unpredictable branches in the bit-at-a-time shifting loop to
be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 10174
  gcd1: elapsed 2120
  gcd2: elapsed 2902
  gcd3: elapsed 2039
  gcd4: elapsed 2812
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9309
  gcd1: elapsed 2280
  gcd2: elapsed 2822
  gcd3: elapsed 2217
  gcd4: elapsed 2710
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9589
  gcd1: elapsed 2098
  gcd2: elapsed 2815
  gcd3: elapsed 2030
  gcd4: elapsed 2718
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9914
  gcd1: elapsed 2309
  gcd2: elapsed 2779
  gcd3: elapsed 2228
  gcd4: elapsed 2709
  PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-20 17:58:30 -07:00
..
abort-ev4.S ARM: entry: provide uaccess assembly macro hooks 2015-08-26 20:27:02 +01:00
abort-ev4t.S
abort-ev5t.S ARM: entry: provide uaccess assembly macro hooks 2015-08-26 20:27:02 +01:00
abort-ev5tj.S ARM: entry: provide uaccess assembly macro hooks 2015-08-26 20:27:02 +01:00
abort-ev6.S ARM: entry: provide uaccess assembly macro hooks 2015-08-26 20:27:02 +01:00
abort-ev7.S ARM: entry: provide uaccess assembly macro hooks 2015-08-26 20:27:02 +01:00
abort-lv4t.S ARM: entry: provide uaccess assembly macro hooks 2015-08-26 20:27:02 +01:00
abort-macro.S ARM: entry: provide uaccess assembly macro hooks 2015-08-26 20:27:02 +01:00
abort-nommu.S
alignment.c uaccess: reimplement probe_kernel_address() using probe_kernel_read() 2015-11-05 19:34:48 -08:00
cache-aurora-l2.h
cache-fa.S
cache-feroceon-l2.c ARM: 8416/1: Feroceon: use of_iomap() to map register base 2015-08-18 14:00:30 +01:00
cache-l2x0.c ARM: 8569/1: pl2x0: Add OF control of cache power management 2016-05-05 19:02:10 +01:00
cache-nop.S
cache-tauros2.c ARM: l2c: tauros2: use descriptive definitions for register bits 2015-11-26 22:12:26 +00:00
cache-tauros3.h
cache-uniphier.c ARM: 8567/1: cache-uniphier: activate ways for secondary CPUs 2016-05-05 19:03:39 +01:00
cache-v4.S
cache-v4wb.S
cache-v4wt.S
cache-v6.S
cache-v7.S ARM: cache-v7: optimise test for Cortex A9 r0pX devices 2015-04-14 22:26:52 +01:00
cache-xsc3l2.c
context.c ARM: 8465/1: mm: keep reserved ASIDs in sync with mm after multiple rollovers 2015-12-02 23:57:54 +00:00
copypage-fa.c
copypage-feroceon.c
copypage-v4mc.c
copypage-v4wb.c
copypage-v4wt.c
copypage-v6.c
copypage-xsc3.c
copypage-xscale.c
dma-mapping.c Merge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm 2016-05-20 10:01:38 -07:00
dma.h ARM: reduce visibility of dmac_* functions 2015-08-01 22:25:04 +01:00
dump.c ARM: 8249/1: mm: dump: don't skip regions 2015-01-07 20:33:33 +00:00
extable.c
fault-armv.c
fault.c mm: remove VM_FAULT_MINOR 2016-03-17 15:09:34 -07:00
fault.h ARM: 8447/1: catch pending imprecise abort on unmask 2015-10-19 17:08:33 +01:00
flush.c mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros 2016-04-04 10:41:08 -07:00
fsr-2level.c
fsr-3level.c
highmem.c kmap_atomic_to_page() has no users, remove it 2015-11-09 15:11:24 -08:00
hugetlbpage.c mm/hugetlb: reduce arch dependent code about huge_pmd_unshare 2015-06-24 17:49:41 -07:00
idmap.c ARM: provide improved virt_to_idmap() functionality 2016-05-03 11:13:54 +01:00
init.c ARM: 8502/1: mm: mark section-aligned portion of rodata NX 2016-02-11 15:44:10 +00:00
iomap.c
ioremap.c ARM: memremap: implement arch_memremap_wb() 2016-04-04 10:26:42 +02:00
Kconfig lib/GCD.c: use binary GCD algorithm instead of Euclidean 2016-05-20 17:58:30 -07:00
l2c-common.c
l2c-l2x0-resume.S
Makefile ARM: uniphier: add outer cache support 2015-10-27 09:20:50 +09:00
mm.h
mmap.c mm: ASLR: use get_random_long() 2016-02-27 10:28:52 -08:00
mmu.c Merge branch 'for-linus' of git://ftp.arm.linux.org.uk/~rmk/linux-arm 2016-03-19 16:31:54 -07:00
nommu.c Merge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm 2016-05-20 10:01:38 -07:00
pabort-legacy.S
pabort-v6.S
pabort-v7.S
pageattr.c ARM: 8544/1: set_memory_xx fixes 2016-03-04 23:32:45 +00:00
pgd.c mm: cleanup *pte_alloc* interfaces 2016-03-17 15:09:34 -07:00
proc-arm7tdmi.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm9tdmi.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm720.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm740.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm920.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm922.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm925.S ARM: 8349/1: arch/arm/mm/proc-arm925.S: remove dead #ifdef block 2015-05-03 23:22:27 +01:00
proc-arm926.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm940.S Merge branches 'misc', 'vdso' and 'fixes' into for-next 2015-04-14 22:28:25 +01:00
proc-arm946.S Merge branches 'misc', 'vdso' and 'fixes' into for-next 2015-04-14 22:28:25 +01:00
proc-arm1020.S ARM: 8348/1: remove comments on CPU_ARM1020_CPU_IDLE 2015-05-03 23:22:09 +01:00
proc-arm1020e.S ARM: 8348/1: remove comments on CPU_ARM1020_CPU_IDLE 2015-05-03 23:22:09 +01:00
proc-arm1022.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-arm1026.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-fa526.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-feroceon.S ARM: 8350/1: proc-feroceon: Fix feroceon_proc_info macro 2015-05-03 23:23:09 +01:00
proc-macros.S Merge branches 'misc', 'vdso' and 'fixes' into for-next 2015-04-14 22:28:25 +01:00
proc-mohawk.S ARM: mohawk: allow building with MMU disabled 2015-12-01 21:44:25 +01:00
proc-sa110.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-sa1100.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-syms.c
proc-v6.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-v7-2level.S Merge branches 'arnd-fixes', 'clk', 'misc', 'v7' and 'fixes' into for-next 2015-06-12 21:18:08 +01:00
proc-v7-3level.S ARM: redo TTBR setup code for LPAE 2015-06-01 23:48:19 +01:00
proc-v7.S ARM: SMP enable of cache maintanence broadcast 2016-04-01 23:27:47 +01:00
proc-v7m.S ARM: 8451/1: v7-M: Set an early stack for __v7m_setup 2015-11-16 18:34:38 +00:00
proc-xsc3.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
proc-xscale.S ARM: 8314/1: replace PROCINFO embedded branch with relative offset 2015-03-28 15:46:14 +00:00
pv-fixup-asm.S ARM: re-implement physical address space switching 2015-06-01 23:46:33 +01:00
tcm.h
tlb-fa.S
tlb-v4.S
tlb-v4wb.S
tlb-v4wbi.S
tlb-v6.S
tlb-v7.S