[libmath-prime-util-perl] 17/72: Allow parallel phi sum by using critical section around phicache
Partha P. Mukherjee
ppm-guest at moszumanska.debian.org
Thu May 21 18:49:37 UTC 2015
This is an automated email from the git hooks/post-receive script.
ppm-guest pushed a commit to annotated tag v0.32
in repository libmath-prime-util-perl.
commit c24f6f695980a7da50ef9c3fc605d3a679e9cad1
Author: Dana Jacobsen <dana at acm.org>
Date: Thu Aug 29 18:21:49 2013 -0700
Allow parallel phi sum by using critical section around phicache
---
Changes | 5 ++
lehmer.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
2 files changed, 218 insertions(+), 54 deletions(-)
diff --git a/Changes b/Changes
index b64d8a8..499227f 100644
--- a/Changes
+++ b/Changes
@@ -12,6 +12,11 @@ Revision history for Perl module Math::Prime::Util
- Use MPU::GMP::pn_primorial if we have it.
+ - Make LMO primecount work (albeit a simplistic version). Memory
+ limiting features added to phi calculations, so the phi calculations
+ should stay under ~150MB even with giant sizes.
+ Thanks to Kim Walisch for all discussions about this.
+
0.31 2013-08-07
- Change proof certificate documentation to reflect the new text format.
diff --git a/lehmer.c b/lehmer.c
index a141d76..48958d0 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -5,6 +5,7 @@
/* Below this size, just sieve. */
#define SIEVE_LIMIT 60000000
+#define MAX_PHI_MEM (128*1024*1024)
/*****************************************************************************
*
@@ -110,6 +111,11 @@ static int const verbose = 0;
#define SET_PPS_SERIAL /* */
#endif
+#ifdef _OPENMP
+ #include <omp.h>
+ int omp_threads = 8; /* will get replaced */
+#endif
+
/* Translations from Perl + Math::Prime::Util to C/C++ + primesieve */
typedef unsigned long UV;
typedef signed long IV;
@@ -151,6 +157,8 @@ void primesieve_callback(uint64_t pk) {
/* Generate an array of n small primes, where the kth prime is element p[k].
* Remember to free when done. */
+#define TINY_PRIME_SIZE 20000
+static UV* tiny_primes = 0;
static UV* generate_small_primes(UV n)
{
UV* primes;
@@ -165,6 +173,12 @@ static UV* generate_small_primes(UV n)
New(0, primes, n+1, UV);
if (primes == 0)
croak("Can not allocate small primes\n");
+ if (n < TINY_PRIME_SIZE) {
+ if (tiny_primes == 0)
+ tiny_primes = generate_small_primes(TINY_PRIME_SIZE+1);
+ memcpy(primes, tiny_primes, (n+1) * sizeof(UV));
+ return primes;
+ }
primes[0] = 0;
sieve_array = primes;
sieve_n = n;
@@ -279,6 +293,8 @@ static UV bs_prime_count(UV n, UV const* const primes, UV lastprime)
return i-1;
}
+#define FAST_DIV(x,y) \
+ ( ((x) <= 4294967295) ? (uint32_t)(x)/(uint32_t)(y) : (x)/(y) )
/* Use Mapes' method to calculate phi(x,a) for small a. This is really
* convenient and a little Perl script will spit this code out for whatever
@@ -298,8 +314,9 @@ static UV mapes(UV x, UV a)
return (UV) val;
}
-static UV mapes7(UV x) { /* A tiny bit faster setup for a=7 */
- IV val = x-x/2-x/3-x/5+x/6-x/7+x/10-x/11-x/13+x/14+x/15-x/17+x/21+x/22+x/26
+#define mapes7(x) ((x <= 4294967295) ? mapes7_32(x) : mapes7_64(x))
+static UV mapes7_64(UV x) { /* A tiny bit faster setup for a=7 */
+ UV val = x-x/2-x/3-x/5+x/6-x/7+x/10-x/11-x/13+x/14+x/15-x/17+x/21+x/22+x/26
-x/30+x/33+x/34+x/35+x/39-x/42+x/51+x/55+x/65-x/66-x/70+x/77-x/78
+x/85+x/91-x/102-x/105-x/110+x/119-x/130+x/143-x/154-x/165-x/170
-x/182+x/187-x/195+x/210+x/221-x/231-x/238-x/255-x/273-x/286+x/330
@@ -316,9 +333,115 @@ static UV mapes7(UV x) { /* A tiny bit faster setup for a=7 */
-x/19635-x/23205-x/24310+x/30030-x/34034-x/36465+x/39270+x/46410
-x/51051+x/72930-x/85085+x/102102+x/170170+x/255255-x/510510;
}
- return (UV) val;
+ return val;
+}
+
+static uint32_t mapes7_32(uint32_t x) {
+ uint32_t val = x-x/2-x/3-x/5+x/6-x/7+x/10-x/11-x/13+x/14+x/15-x/17+x/21
+ +x/22+x/26-x/30+x/33+x/34+x/35+x/39-x/42+x/51+x/55+x/65-x/66-x/70
+ +x/77-x/78+x/85+x/91-x/102-x/105-x/110+x/119-x/130+x/143-x/154
+ -x/165-x/170-x/182+x/187-x/195+x/210+x/221-x/231-x/238-x/255-x/273
+ -x/286+x/330-x/357-x/374-x/385+x/390-x/429-x/442-x/455+x/462+x/510
+ +x/546-x/561-x/595-x/663+x/714;
+ if (x >= 715) {
+ val += 0-x/715+x/770+x/858+x/910-x/935-x/1001-x/1105+x/1122+x/1155+x/1190
+ -x/1309+x/1326+x/1365+x/1430-x/1547+x/1785+x/1870+x/2002+x/2145
+ +x/2210-x/2310-x/2431+x/2618-x/2730+x/2805+x/3003+x/3094+x/3315
+ -x/3570+x/3927-x/4290+x/4641+x/4862+x/5005-x/5610-x/6006+x/6545
+ -x/6630+x/7293+x/7735-x/7854;
+ if (x >= 9282)
+ val += 0-x/9282-x/10010+x/12155-x/13090-x/14586-x/15015-x/15470+x/17017
+ -x/19635-x/23205-x/24310+x/30030-x/34034-x/36465+x/39270+x/46410
+ -x/51051+x/72930-x/85085+x/102102+x/170170+x/255255-x/510510;
+ }
+ return val;
}
+/* Max memory = 2*A*X bytes, e.g. 2*400*16000 = 12.2 MB */
+#define PHICACHEA 400
+#define PHICACHEX 16000
+static uint32_t _phicache_max[PHICACHEA] = {0};
+static int16_t* _phicache[PHICACHEA] = {0};
+static void phicache_free(void) {
+ int a;
+#ifdef _OPENMP
+ #pragma omp critical(phicache)
+#endif
+ for (a = 0; a < PHICACHEA; a++) {
+ if (_phicache[a] != 0)
+ Safefree(_phicache[a]);
+ _phicache[a] = 0;
+ _phicache_max[a] = 0;
+ }
+}
+#define PHI_CACHE_POPULATED(x, a) \
+ ((a) < PHICACHEA && (x) < PHICACHEX && \
+ _phicache_max[a] > (x) && _phicache[a][x] != 0)
+
+static void phi_cache_insert(UV x, UV a, IV sum) {
+#ifdef _OPENMP
+ #pragma omp critical(phicache)
+#endif
+ if (a < PHICACHEA && x < PHICACHEX) {
+ uint32_t cap = ((x+1+31)/32)*32;
+ if (_phicache[a] == 0) {
+ Newz(0, _phicache[a], cap, int16_t);
+ _phicache_max[a] = cap;
+ } else if (_phicache_max[a] < cap) {
+ uint32_t i;
+ Renew(_phicache[a], cap, int16_t);
+ for (i = _phicache_max[a]; i < cap; i++)
+ _phicache[a][i] = 0;
+ _phicache_max[a] = cap;
+ }
+ if (sum < SHRT_MIN || sum > SHRT_MAX)
+ croak("phi(%lu,%lu) 16-bit overflow: sum = %ld\n", x, a, sum);
+ _phicache[a][x] = sum;
+ }
+}
+static size_t phicache_size(void) {
+ size_t sum = sizeof(_phicache);
+ int a;
+#ifdef _OPENMP
+ #pragma omp critical(phicache)
+#endif
+ for (a = 0; a < PHICACHEA; a++)
+ if (_phicache[a] != 0)
+ sum += _phicache_max[a] * sizeof(_phicache[a][0]);
+ return sum;
+}
+
+static IV _phi3(UV x, UV a, int sign, const UV* primes)
+{
+ IV sum;
+
+ if (a < 3)
+ return sign * ((a==0) ? x : (a==1) ? x-x/2 : x-x/2-x/3+x/6);
+ else if (PHI_CACHE_POPULATED(x, a))
+ return sign * _phicache[a][x];
+ else if (x < primes[a+1])
+ sum = sign;
+ else {
+ UV a2;
+ if (a*a > x) {
+ UV ix = isqrt(x);
+ UV iters = bs_prime_count(ix, primes, a+1);
+ sum = sign * (x - a + iters);
+ for (a2 = 1; a2 <= iters; a2++)
+ sum += _phi3( FAST_DIV(x, primes[a2]), a2-1, -sign, primes);
+ } else {
+ sum = 0;
+ for (a2 = 8; a2 <= a; a2++)
+ sum += _phi3( FAST_DIV(x,primes[a2]), a2-1, -sign, primes);
+ a2 = (a > 7) ? 7 : a;
+ sum += sign * (PHI_CACHE_POPULATED(x, a2) ? _phicache[a2][x] : mapes(x, a2));
+ }
+ }
+ phi_cache_insert(x, a, sign * sum);
+ return sum;
+}
+#define phi_small(x, a, primes) _phi3(x, a, 1, primes)
+
/******************************************************************************/
/* In-order lists for manipulating our UV value / IV count pairs */
/******************************************************************************/
@@ -362,11 +485,11 @@ static void vcarray_insert(vcarray_t* l, UV val, IV count)
UV new_size;
if (l->size == 0) {
new_size = 20000;
- if (verbose>2) printf("ALLOCing list, size %lu\n", new_size);
+ if (verbose>2) printf("ALLOCing list, size %lu (%luk)\n", new_size, new_size*sizeof(vc_t)/1024);
New(0, l->a, new_size, vc_t);
} else {
new_size = (UV) (1.5 * l->size);
- if (verbose>2) printf("REALLOCing list %p, new size %lu\n",l->a,new_size);
+ if (verbose>2) printf("REALLOCing list %p, new size %lu (%luk)\n",l->a,new_size, new_size*sizeof(vc_t)/1024);
Renew( l->a, new_size, vc_t );
}
if (l->a == 0) croak("could not allocate list\n");
@@ -410,7 +533,7 @@ static void vcarray_merge(vcarray_t* a, vcarray_t* b)
else
ba[bj++] = ba[bi];
}
- if (verbose>2) printf(" removed %lu duplicates from b\n", bn - bj);
+ if (verbose>3) printf(" removed %lu duplicates from b\n", bn - bj);
bn = bj;
if (bn == 0) { /* In case they were all duplicates */
@@ -422,7 +545,7 @@ static void vcarray_merge(vcarray_t* a, vcarray_t* b)
kn = an+bn;
if ((long)a->size < kn) { /* Make A big enough to hold kn elements */
UV new_size = (UV) (1.2 * kn);
- if (verbose>2) printf("REALLOCing list %p, new size %lu\n", a->a, new_size);
+ if (verbose>2) printf("REALLOCing list %p, new size %lu (%luk)\n", a->a, new_size, new_size*sizeof(vc_t)/1024);
Renew( a->a, new_size, vc_t );
aa = a->a; /* this could have been changed by the realloc */
a->size = new_size;
@@ -448,6 +571,24 @@ static void vcarray_merge(vcarray_t* a, vcarray_t* b)
b->n = 0; /* B is marked empty */
}
+static void vcarray_remove_zeros(vcarray_t* a)
+{
+ long ai = 0;
+ long aj = 0;
+ long an = a->n;
+ vc_t* aa = a->a;
+
+ while (aj < an) {
+ if (aa[aj].c != 0) {
+ if (ai != aj)
+ aa[ai] = aa[aj];
+ ai++;
+ }
+ aj++;
+ }
+ a->n = ai;
+}
+
/*
* The main phi(x,a) algorithm. In this implementation, it takes under 10%
@@ -475,7 +616,7 @@ static UV phi(UV x, UV a)
a2 = vcarray_create();
vcarray_insert(&a1, x, 1);
- while (a > 7) {
+ while (a > 7 && a1.n < (MAX_PHI_MEM/16)) {
UV primea = primes[a];
UV sval_last = 0;
IV sval_count = 0;
@@ -484,7 +625,7 @@ static UV phi(UV x, UV a)
count = arr[i].c;
if (count == 0) continue; /* Skip if count = 0 */
val = arr[i].v;
- sval = val / primea;
+ sval = FAST_DIV(val, primea);
if (sval < primea) break; /* stop inserting into a2 if small */
if (sval != sval_last) { /* non-merged value. Insert into a2 */
if (sval_last != 0)
@@ -497,19 +638,30 @@ static UV phi(UV x, UV a)
if (sval_last != 0) /* Insert the last sval */
vcarray_insert(&a2, sval_last, sval_count);
/* For each small sval, add up the counts */
- for ( ; i < a1.n; i++)
+ for ( ; i < a1.n; i++)
sum -= arr[i].c;
/* Merge a1 and a2 into a1. a2 will be emptied. */
vcarray_merge(&a1, &a2);
+ vcarray_remove_zeros(&a1);
a--;
}
vcarray_destroy(&a2);
- if (a != 7) croak("final loop is set for a=7, a = %lu\n", a);
arr = a1.a;
- for (i = 0; i < a1.n; i++) {
- count = arr[i].c;
- if (count != 0)
- sum += count * mapes7( arr[i].v );
+ if (a > 7) {
+ if (verbose > 0) printf("CUT TO SMALL PHI AT a = %lu\n", a);
+#ifdef _OPENMP
+ omp_threads = omp_get_max_threads();
+ #pragma omp parallel for reduction(+: sum) num_threads(omp_threads) schedule(dynamic, 16)
+#endif
+ for (i = 0; i < a1.n; i++)
+ sum += arr[i].c * phi_small( arr[i].v, a, primes );
+ } else {
+#ifdef _OPENMP
+ omp_threads = omp_get_max_threads();
+ #pragma omp parallel for reduction(+: sum) num_threads(omp_threads) schedule(dynamic, 16)
+#endif
+ for (i = 0; i < a1.n; i++)
+ sum += arr[i].c * mapes7( arr[i].v );
}
vcarray_destroy(&a1);
Safefree(primes);
@@ -524,13 +676,23 @@ static UV phi(UV x, UV a)
* method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
UV _XS_legendre_pi(UV n)
{
- UV a;
+ UV a, phina;
+ DECLARE_TIMING_VARIABLES;
if (n < SIEVE_LIMIT)
return _XS_prime_count(2, n);
+ if (verbose > 0) printf("legendre %lu stage 1: calculate a \n", n);
+ TIMING_START;
a = _XS_legendre_pi(isqrt(n));
+ TIMING_END_PRINT("stage 1")
- return phi(n, a) + a - 1;
+ if (verbose > 0) printf("legendre %lu stage 2: phi(x,a) (a=%lu)\n", n, a);
+ TIMING_START;
+ phina = phi(n, a);
+ if (verbose > 0) printf("phicache size: %lu\n", (unsigned long)phicache_size());
+ phicache_free();
+ TIMING_END_PRINT("stage 2")
+ return phina + a - 1;
}
@@ -553,6 +715,8 @@ UV _XS_meissel_pi(UV n)
if (verbose > 0) printf("meissel %lu stage 2: phi(x,a) (a=%lu b=%lu c=%lu)\n", n, a, b, c);
TIMING_START;
sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
+ if (verbose > 1) printf("phicache size: %lu\n", (unsigned long)phicache_size());
+ phicache_free();
if (verbose > 0) printf("phi(%lu,%lu) = %lu. sum = %lu\n", n, a, sum - ((b+a-2) * (b-a+1) / 2), sum);
TIMING_END_PRINT("phi(x,a)")
@@ -614,6 +778,8 @@ UV _XS_lehmer_pi(UV n)
if (verbose > 0) printf("lehmer %lu stage 2: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
TIMING_START;
sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
+ if (verbose > 1) printf("phicache size: %lu\n", (unsigned long)phicache_size());
+ phicache_free();
TIMING_END_PRINT("phi(x,a)")
/* We get an array of the first b primes. This is used in stage 4. If we
@@ -627,7 +793,6 @@ UV _XS_lehmer_pi(UV n)
lastpc = primes[lastprime];
TIMING_END_PRINT("small primes")
-
TIMING_START;
/* Speed up all the prime counts by doing a big base sieve */
prime_precalc( (UV) pow(n, 3.0/5.0) );
@@ -661,15 +826,14 @@ UV _XS_lehmer_pi(UV n)
}
+/* The Lagarias-Miller-Odlyzko method. Very simple implementation without
+ * many optimizations. It is very nice for memory use, but it isn't
+ * particularly fast. */
UV _XS_LMO_pi(UV n)
{
-#if 0
- UV a, b, sum, i, lastprime, lastpc, lastw, lastwpc;
- UV n13, n12, n23;
- IV S1;
- UV S2, P2;
+ UV n12, n13, a, b, sum, i, j, lastprime, lastpc, lastw, lastwpc, P2, S1, S2;
const UV* primes = 0; /* small prime cache */
- signed char* mu = 0; /* moebius to n^1/3 */
+ signed char* mu = 0; /* moebius to n^1/3 */
UV* lpf = 0; /* least prime factor to n^1/3 */
DECLARE_TIMING_VARIABLES;
if (n < SIEVE_LIMIT)
@@ -679,12 +843,12 @@ UV _XS_LMO_pi(UV n)
TIMING_START;
n13 = icbrt(n);
n12 = isqrt(n);
- n23 = (UV) (pow(n, 2.0/3.0)+0.01);
a = _XS_lehmer_pi(n13);
b = _XS_lehmer_pi(n12);
TIMING_END_PRINT("stage 1")
lastprime = b*SIEVE_MULT+1;
+ if (lastprime < n13) lastprime = n13;
if (verbose > 0) printf("LMO %lu stage 2: %lu small primes\n", n, lastprime);
TIMING_START;
primes = generate_small_primes(lastprime);
@@ -692,17 +856,13 @@ UV _XS_LMO_pi(UV n)
lastpc = primes[lastprime];
TIMING_END_PRINT("small primes")
- if (verbose > 0) printf("LMO %lu stage 3: calculate mu/lpf to %lu\n", n, a);
+ if (verbose > 0) printf("LMO %lu stage 3: moebius and lpf\n", n);
TIMING_START;
- /* We could call MPU's:
- * mu = _moebius_range(0, n13+1)
- * but we will do the least prime factor calculation at the same time.
- */
New(0, mu, n13+1, signed char);
memset(mu, 1, sizeof(char) * (n13+1));
Newz(0, lpf, n13+1, UV);
mu[0] = 0;
- for (i = 1; i <= a; i++) {
+ for (i = 1; i <= n13; i++) {
UV primei = primes[i];
UV j, isquared;
for (j = primei; j <= n13; j += primei) {
@@ -713,31 +873,33 @@ UV _XS_LMO_pi(UV n)
for (j = isquared; j <= n13; j += isquared)
mu[j] = 0;
}
- /* for (i = 0; i <= n13; i++) { printf("mu %lu %ld\n", i, (IV)mu[i]); } */
- TIMING_END_PRINT("mu")
+ lpf[1] = UV_MAX; /* Set lpf[1] to max */
+ TIMING_END_PRINT("moebius and lpf")
- if (verbose > 0) printf("LMO %lu stage 4: calculate S1 (%lu)\n", n, n13);
+ /* Thanks to Kim Walisch for help with the S1+S2 calculations. */
+ if (verbose > 0) printf("LMO %lu stage 4: S1 and S2\n", n);
TIMING_START;
+ UV k = (a < 7) ? a : 7;
S1 = 0;
- for (i = 1; i <= n13; i++)
- if (mu[i] != 0)
- S1 += mu[i] * (IV) (n/i);
- TIMING_END_PRINT("S1")
- if (verbose > 0) printf("LMO %lu stage 4: S1 = %ld\n", n, S1);
-
S2 = 0;
- /* TODO... */
-
- Safefree(mu);
- Safefree(lpf);
-
- prime_precalc(isqrt(n / primes[a+1]));
- if (verbose > 0) printf("LMO %lu stage 5: P2 loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]);
+ for (i = 1; i <= n13; i++)
+ if (lpf[i] > primes[k])
+ S1 += mu[i] * mapes(n/i, k);
+ for (i = k; i+1 < a; i++)
+ for (j = (n13/primes[i+1])+1; j <= n13; j++)
+ if (lpf[j] > primes[i+1])
+ S2 -= mu[j] * phi_small(n / (j*primes[i+1]), i, primes);
+ if (verbose > 0) printf("phicache size: %lu\n", (unsigned long)phicache_size());
+ phicache_free();
+ TIMING_END_PRINT("S1 and S2")
+
+ if (verbose > 0) printf("LMO %lu stage 5: P2\n", n);
TIMING_START;
- P2 = 0;
+ prime_precalc(isqrt(n / primes[a+1]));
/* Reverse the i loop so w increases. Count w in segments. */
lastw = 0;
lastwpc = 0;
+ P2 = 0;
for (i = b; i > a; i--) {
UV w = n / primes[i];
lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime)
@@ -746,14 +908,11 @@ UV _XS_LMO_pi(UV n)
P2 += lastwpc;
}
P2 -= ((b+a-2) * (b-a+1) / 2) - a + 1;
- TIMING_END_PRINT("P2")
- if (verbose > 0) printf("LMO %lu stage 5: P2 = %lu\n", n, P2);
+ TIMING_END_PRINT("stage 5 (P2)")
+ /* printf("S1 = %lu\nS2 = %lu\na = %lu\nP2 = %lu\n", S1, S2, a, P2); */
+ sum = (S1 + S2) + a - 1 - P2;
Safefree(primes);
- sum = P2 + S1 + S2;
return sum;
-#else
- croak("not implemented: LMO(%lu)", (unsigned long) n);
-#endif
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git
More information about the Pkg-perl-cvs-commits
mailing list