linux_dsm_epyc7002/arch/arm
Zhaoxiu Zeng fff7fb0b2d lib/GCD.c: use binary GCD algorithm instead of Euclidean
The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available.  This
allows the unpredictable branches in the bit-at-a-time shifting loop to
be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 10174
  gcd1: elapsed 2120
  gcd2: elapsed 2902
  gcd3: elapsed 2039
  gcd4: elapsed 2812
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9309
  gcd1: elapsed 2280
  gcd2: elapsed 2822
  gcd3: elapsed 2217
  gcd4: elapsed 2710
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9589
  gcd1: elapsed 2098
  gcd2: elapsed 2815
  gcd3: elapsed 2030
  gcd4: elapsed 2718
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9914
  gcd1: elapsed 2309
  gcd2: elapsed 2779
  gcd3: elapsed 2228
  gcd4: elapsed 2709
  PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-20 17:58:30 -07:00
..
boot Merge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm 2016-05-20 10:01:38 -07:00
common Merge branch 'for-linus' of git://ftp.arm.linux.org.uk/~rmk/linux-arm 2016-03-19 16:31:54 -07:00
configs ARM: SoC defconfig updates for v4.7 2016-05-18 13:07:57 -07:00
crypto Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6 2016-03-17 11:22:54 -07:00
firmware
include Merge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm 2016-05-20 10:01:38 -07:00
kernel printk/nmi: generic solution for safe printk in NMI 2016-05-20 17:58:30 -07:00
kvm Small release overall. 2016-05-19 11:27:09 -07:00
lib
mach-alpine
mach-artpec
mach-asm9260
mach-aspeed arm: Add Aspeed machine 2016-05-09 17:41:14 +09:30
mach-at91 ARM: at91/soc: reference the whole sama5d2 family 2016-03-29 16:34:31 +02:00
mach-axxia
mach-bcm bus: brcmstb_gisb: Rework dependencies 2016-04-18 14:20:30 -07:00
mach-berlin cpufreq: berlin: Use generic platdev driver 2016-04-25 16:18:23 +02:00
mach-clps711x
mach-cns3xxx ARM: SoC cleanups for v4.6 2016-03-20 14:37:22 -07:00
mach-davinci ARM: DT updates for v4.7 2016-05-18 12:48:46 -07:00
mach-digicolor
mach-dove ARM: dove: Remove CLK_IS_ROOT 2016-04-27 13:55:52 +02:00
mach-ebsa110
mach-efm32
mach-ep93xx
mach-exynos ARM: SoC driver updates for v4.7 2016-05-18 13:14:02 -07:00
mach-footbridge
mach-gemini
mach-highbank
mach-hisi
mach-imx ARM: SoC platform updates for v4.7 2016-05-18 12:35:46 -07:00
mach-integrator ARM: integrator: move flash registration to device tree 2016-04-04 10:33:16 +02:00
mach-iop13xx
mach-iop32x
mach-iop33x
mach-ixp4xx This is the bulk of GPIO changes for kernel v4.6: 2016-03-17 21:05:32 -07:00
mach-keystone ARM: provide improved virt_to_idmap() functionality 2016-05-03 11:13:54 +01:00
mach-ks8695 ARM: SoC non-urgent fixes for v4.6 2016-03-20 14:26:57 -07:00
mach-lpc18xx
mach-lpc32xx ARM: SoC platform updates for v4.7 2016-05-18 12:35:46 -07:00
mach-mediatek ARM: SoC driver updates for v4.7 2016-05-18 13:14:02 -07:00
mach-meson
mach-mmp
mach-moxart
mach-mv78xx0 ARM: mv78xx0: Remove CLK_IS_ROOT 2016-04-27 12:42:55 +02:00
mach-mvebu cpufreq: mvebu: Move cpufreq code into drivers/cpufreq/ 2016-04-28 15:22:43 +02:00
mach-mxs
mach-netx ARM: SoC cleanups for v4.6 2016-03-20 14:37:22 -07:00
mach-nomadik
mach-nspire ARM: SoC cleanups for v4.6 2016-03-20 14:37:22 -07:00
mach-omap1 Revert "ARM: OMAP: Catch callers of revision information prior to it being populated" 2016-04-19 08:01:05 -07:00
mach-omap2 ARM: SoC platform updates for v4.7 2016-05-18 12:35:46 -07:00
mach-orion5x ARM: orion5x: Remove CLK_IS_ROOT 2016-04-27 13:11:42 +02:00
mach-oxnas ARM: Add new mach-oxnas 2016-04-26 09:50:52 +02:00
mach-picoxcell
mach-prima2 ARM: SoC cleanups for v4.6 2016-03-20 14:37:22 -07:00
mach-pxa Merge back new device properties material for v4.7. 2016-05-06 22:07:33 +02:00
mach-qcom
mach-realview ARM: realview: hide unused 'pmu_device' object 2016-04-04 10:58:47 +02:00
mach-rockchip ARM: SoC cleanups and fixes for v4.7 2016-05-18 12:28:29 -07:00
mach-rpc
mach-s3c24xx ARM: SoC 64-bit changes for v4.6 2016-03-20 15:08:45 -07:00
mach-s3c64xx ARM: SoC platform updates for v4.6 2016-03-20 14:57:08 -07:00
mach-s5pv210
mach-sa1100 ARM: sa1100: remove references to the defunct handhelds.org 2016-04-12 12:34:15 -07:00
mach-shmobile ARM: SoC driver updates for v4.7 2016-05-18 13:14:02 -07:00
mach-socfpga * Altera Arria10 L2 cache and On-Chip RAM ECC handling. (Thor Thayer) 2016-05-16 18:44:39 -07:00
mach-spear
mach-sti ARM: STi: Update platform level menuconfig 'help' 2016-04-07 16:32:14 +02:00
mach-stm32
mach-sunxi cpufreq: sunxi: Use generic platdev driver 2016-04-25 16:18:24 +02:00
mach-tango
mach-tegra ARM: SoC driver updates for v4.7 2016-05-18 13:14:02 -07:00
mach-u300
mach-uniphier ARM: SoC platform updates for v4.7 2016-05-18 12:35:46 -07:00
mach-ux500 ARM: SoC cleanups for v4.6 2016-03-20 14:37:22 -07:00
mach-versatile ARM: versatile: move flash registration to the device tree 2016-04-04 10:33:16 +02:00
mach-vexpress ARM: vexpress/mps2: introduce MPS2 platform 2016-04-26 12:50:01 +02:00
mach-vt8500
mach-w90x900 Merge branch 'for-linus' of git://ftp.arm.linux.org.uk/~rmk/linux-arm 2016-03-19 16:31:54 -07:00
mach-zx
mach-zynq cpufreq: zynq: Use generic platdev driver 2016-04-25 16:18:24 +02:00
mm lib/GCD.c: use binary GCD algorithm instead of Euclidean 2016-05-20 17:58:30 -07:00
net
nwfpe
oprofile
plat-iop
plat-omap
plat-orion Merge branch 'for-linus' of git://ftp.arm.linux.org.uk/~rmk/linux-arm 2016-03-19 16:31:54 -07:00
plat-pxa
plat-samsung ARM: EXYNOS: Remove SROM related register settings from mach-exynos 2016-04-18 14:25:28 +02:00
plat-versatile
probes
tools ARM: 8562/1: suppress "include/generated/mach-types.h is up to date." 2016-04-19 19:42:47 +01:00
vdso
vfp exit_thread: accept a task parameter to be exited 2016-05-20 17:58:30 -07:00
xen
Kconfig printk/nmi: generic solution for safe printk in NMI 2016-05-20 17:58:30 -07:00
Kconfig-nommu
Kconfig.debug ARM: debug: remove extraneous DEBUG_HI3716_UART option 2016-04-26 13:00:11 +02:00
Makefile ARM: vexpress/mps2: introduce MPS2 platform 2016-04-26 12:50:01 +02:00