[libmath-prime-util-perl] 49/50: Fix prime count issue and make standalone. Add Meissel method.

Thu May 21 18:45:41 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.13
in repository libmath-prime-util-perl.

commit b20511442938a45ac67340916f2455f6e2b66078
Author: Dana Jacobsen <dana at acm.org>
Date:   Mon Nov 19 00:08:40 2012 -0800

    Fix prime count issue and make standalone.  Add Meissel method.
---
 XS.xs             |   6 +
 lehmer.c          | 369 ++++++++++++++++++++++++++++++++++++++----------------
 lehmer.h          |   1 +
 t/13-primecount.t |   6 +-
 4 files changed, 271 insertions(+), 111 deletions(-)

diff --git a/XS.xs b/XS.xs
index d61f012..5c69ba7 100644
--- a/XS.xs
+++ b/XS.xs
@@ -42,6 +42,12 @@ _XS_prime_count(IN UV low, IN UV high = 0)
     RETVAL
 
 UV
+_XS_legendre_pi(IN UV n)
+
+UV
+_XS_meissel_pi(IN UV n)
+
+UV
 _XS_lehmer_pi(IN UV n)
 
 UV
diff --git a/lehmer.c b/lehmer.c
index 9a38c95..facc3d7 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -3,10 +3,7 @@
 #include <string.h>
 #include <math.h>
 
-#include "lehmer.h"
-#include "util.h"
-#include "cache.h"
-#include "sieve.h"
+#define SIEVE_LIMIT  1000000    /* Just sieve if smaller than this */
 
 /*****************************************************************************
  *
@@ -16,13 +13,14 @@
  * This is free software; you can redistribute it and/or modify it under
  * the same terms as the Perl 5 programming language system itself.
  *
- * This file is part of the Math::Prime::Util Perl module.  It relies on two
- * features: (1) a sieve to generate very small primes which could be as
- * simple as a three-line SoE, and (2) _XS_prime_count(low, high) is expected
- * to return the prime count within the segment low to high inclusive.  These
- * are used for the relatively small segments in the final stage, but making
- * it fast is important for the final performance.  The primesieve package is
- * one source of excellent routines for either task.
+ * This file is part of the Math::Prime::Util Perl module, but also can be
+ * compiled as a standalone UNIX program using the primesieve package.
+ *
+ *    g++ -O3 -DPRIMESIEVE_STANDALONE lehmer.c -o prime_count -lprimesieve
+ *
+ * For faster prime counting in stage 4 with multiprocessor machines:
+ *
+ *    g++ -O3 -DPRIMESIEVE_STANDALONE -DPRIMESIEVE_PARALLEL lehmer.c -o prime_count -lprimesieve -lgomp
  *
  * The phi(x,a) calculation is unique, to the best of my knowledge.  It keeps
  * a heap of all x values + signed counts for the given 'a' value, and walks
@@ -35,6 +33,8 @@
  * Calculating pi(10^11) is done in under 1 second on my computer.  pi(10^14)
  * takes under 1 minute, pi(10^16) in a half hour.  Compared with Thomas
  * R. Nicely's pix4 program, this one is 3-5x faster and uses 2-3x less memory.
+ * When compiled with parallel primesieve it is another 2x or more faster:
+ * pix4(10^16) takes 124 minutes, this takes 6 minutes.
  *
  *    n     phi(x,a) mem/time  |  stage 4 mem/time  | total time
  *   10^17   4953MB    871.14   |   2988MB  9911.9    | 179m 37.5s
@@ -46,13 +46,16 @@
  *   10^11               0.034  |              0.138  |     0.213s
  *   10^10               0.007  |              0.025  |     0.064s
  *
+ * These timings are using Perl + MPU.  The standalone version using primesieve
+ * speeds up stage 4 a lot for large values.
+ *
  * Reference: Hans Riesel, "Prime Numbers and Computer Methods for
  * Factorization", 2nd edition, 1994.
  */
 
 static int const verbose = 0;
-#define DO_TIMING 0
-#if DO_TIMING
+
+#ifdef STAGE_TIMING
  #include <sys/time.h>
  #define DECLARE_TIMING_VARIABLES  struct timeval t0, t1;
  #define TIMING_START   gettimeofday(&t0, 0);
@@ -67,6 +70,122 @@ static int const verbose = 0;
  #define TIMING_END_PRINT(text)
 #endif
 
+
+#ifdef PRIMESIEVE_STANDALONE
+
+#include <limits.h>
+#include <sys/time.h>
+#ifdef PRIMESIEVE_PARALLEL
+ #include <primesieve/soe/ParallelPrimeSieve.h>
+ ParallelPrimeSieve ps;
+#else
+ #include <primesieve/soe/PrimeSieve.h>
+ PrimeSieve ps;
+#endif
+
+/* Translations from Perl + Math::Prime::Util  to  C/C++ + primesieve */
+typedef unsigned long UV;
+typedef   signed long IV;
+#define UV_MAX ULONG_MAX
+#define New(id, mem, size, type)  mem = (type*) malloc((size)*sizeof(type))
+#define Newz(id, mem, size, type) mem = (type*) calloc(size, sizeof(type))
+#define Renew(mem, size, type)    mem = (type*) realloc(mem,(size)*sizeof(type))
+#define Safefree(mem)             free((void*)mem)
+#define _XS_prime_count(a, b)     ps.countPrimes(a, b)
+#define croak(fmt,...)            { printf(fmt,##__VA_ARGS__); exit(1); }
+#define prime_precalc(n)          /* */
+
+/* There has _got_ to be a better way to get an array of small primes using
+ * primesieve.  This is ridiculous. */
+static UV* sieve_array = 0;
+static UV sieve_k;
+static UV sieve_n;
+void primesieve_callback(uint64_t pk)
+  { if (sieve_k <= sieve_n) sieve_array[sieve_k++] = pk; }
+
+/* Generate an array of small primes up to and including n, where the kth
+ * prime is element p[k].  Remember to free when done. */
+static UV* generate_small_primes(UV n)
+{
+  UV* primes;
+  UV nth_prime = (n <= 10) ? 29 : n * ( log(n) + log(log(n)) ) + 1;
+  New(0, primes, n+1, UV);
+  if (primes == 0)
+    croak("Can not allocate small primes\n");
+  primes[0] = 0;
+  sieve_array = primes;
+  sieve_n = n;
+  sieve_k = 1;
+  ps.generatePrimes(2, nth_prime, primesieve_callback);
+  sieve_array = 0;
+  return primes;
+}
+
+#else
+
+#include "lehmer.h"
+#include "util.h"
+#include "cache.h"
+#include "sieve.h"
+
+/* Generate an array of small primes up to and including n, where the kth
+ * prime is element p[k].  Remember to free when done. */
+static UV* generate_small_primes(UV n)
+{
+  const unsigned char* sieve;
+  UV* primes;
+  UV  i, nth_prime;
+
+  /* Dusart 1999 bound */
+  nth_prime = (n <= 10) ? 29 : n * ( log(n) + log(log(n)) ) + 1;
+
+  if (get_prime_cache(nth_prime, &sieve) < nth_prime) {
+    release_prime_cache(sieve);
+    croak("Could not generate sieve for %"UVuf, nth_prime);
+  }
+  New(0, primes, n+1, UV);
+  if (primes == 0)
+    croak("Can not allocate small primes\n");
+  primes[0] = 0; primes[1] = 2; primes[2] = 3; primes[3] = 5;
+  i = 3;
+  START_DO_FOR_EACH_SIEVE_PRIME( sieve, 7, nth_prime ) {
+    if (i >= n) break;
+    primes[++i] = p;
+  } END_DO_FOR_EACH_SIEVE_PRIME
+  release_prime_cache(sieve);
+  if (i < n)
+    croak("Did not generate enough small primes.\n");
+  if (verbose > 1) printf("generated %lu small primes, from 2 to %lu\n", i, primes[i]);
+  return primes;
+}
+
+#endif
+
+/* Given an array of primes[1..lastprime], return Pi(n) where n <= lastprime.
+ * This is actually quite fast, and definitely faster than sieving.  By using
+ * this we can avoid caching prime counts and also skip most calls to the
+ * segment siever.
+ */
+static UV bs_prime_count(UV n, UV const* const primes, UV lastprime)
+{
+  UV i, j;
+  if (n < 2)  return 0;
+  /* if (n > primes[lastprime])  return _XS_prime_count(2, n); */
+  if (n >= primes[lastprime]) {
+    if (n == primes[lastprime]) return lastprime;
+    croak("called bspc(%lu) with counts up to %lu\n", n, primes[lastprime]);
+  }
+  i = 1;
+  j = lastprime;
+  while (i < j) {
+    UV mid = (i+j)/2;
+    if (primes[mid] <= n)  i = mid+1;
+    else                   j = mid;
+  }
+  return i-1;
+}
+
+
 /* Use Mapes' method to calculate phi(x,a) for small a.  This is really
  * convenient and a little Perl script will spit this code out for whatever
  * limit we select.  It gets unwieldy with large a values.
@@ -270,24 +389,32 @@ static void heap_destroy(heap_t* h)
  * memory consuming part.
  */
 
-static UV phi(UV x, UV a, UV const* const primes)
+static UV phi(UV x, UV a)
 {
   heap_t h1, h2;
-  UV val;
+  UV val, smallv;
   IV count, sum = 0;
-  UV primea = primes ? primes[a+1] : _XS_nth_prime(a+1);
-  if (x < primea)  return (x > 0) ? 1 : 0;
-  if (a == 1)      return ((x+1)/2);
-  if (a <= 7)      return mapes(x, a);
+  const UV* primes;
 
-  primea = primes ? primes[a] : _XS_prev_prime(primea);
+  if (a == 1)  return ((x+1)/2);
+  if (a <= 7)  return mapes(x, a);
 
-  h1 = heap_create(a * 1000);
-  h2 = heap_create(a * 1000);
+  primes = generate_small_primes(a+1);
+  if (primes == 0)
+    croak("Could not generate primes for phi(%lu,%lu)\n", x, a);
+  if (x < primes[a+1])  { Safefree(primes); return (x > 0) ? 1 : 0; }
 
+  /* This is a hack, trying to guess at a value where the array gets dense */
+  smallv = a * 1000;
+  if (smallv > x / primes[a] / 5)
+    smallv = x / primes[a] / 5;
+
+  h1 = heap_create(smallv);
+  h2 = heap_create(smallv);
   heap_insert(&h1, x, 1);
 
   while (a > 7) {
+    UV primea = primes[a];
     if (heap_not_empty(h2)) croak("h2 heap isn't empty.");
     while ( heap_not_empty(h1) ) {
       UV sval;
@@ -303,8 +430,8 @@ static UV phi(UV x, UV a, UV const* const primes)
       }
     }
     { heap_t t = h1; h1 = h2; h2 = t; }
+    /* printf("a = %lu  heap %lu/%lu + %lu\n", a, h1.small_N, h1.small_limit, h1.N); */
     a--;
-    primea = primes ? primes[a] : _XS_prev_prime(primea);
   }
   heap_destroy(&h2);
   if (a != 7) croak("final loop is set for a=7, a = %lu\n", a);
@@ -314,110 +441,98 @@ static UV phi(UV x, UV a, UV const* const primes)
       sum += count * mapes7(val);
   }
   heap_destroy(&h1);
+  Safefree(primes);
   return (UV) sum;
 }
 
 
-static UV* generate_small_primes(UV *n)
+/* Legendre's method.  Interesting and a good test for phi(x,a), but Lehmer's
+ * method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
+UV _XS_legendre_pi(UV n)
 {
-  const unsigned char* sieve;
-  UV* primea;
-  UV  nth_prime;
-  UV  i;
-
-  /* Dusart 1999 bound */
-  nth_prime = (*n <= 10) ? 29 : *n * ( log(*n) + log(log(*n)) ) + 1;
+  UV a;
+  if (n < SIEVE_LIMIT)
+    return _XS_prime_count(2, n);
 
-  if (get_prime_cache(nth_prime, &sieve) < nth_prime) {
-    release_prime_cache(sieve);
-    croak("Could not generate sieve for %"UVuf, nth_prime);
-  }
-  New(0, primea, *n+4, UV);
-  if (primea == 0)
-    croak("Can not allocate small primes\n");
-  primea[0] = 0; primea[1] = 2; primea[2] = 3; primea[3] = 5;
-  i = 3;
-  START_DO_FOR_EACH_SIEVE_PRIME( sieve, 7, nth_prime ) {
-    if (i >= *n) break;
-    primea[++i] = p;
-  } END_DO_FOR_EACH_SIEVE_PRIME
-  release_prime_cache(sieve);
-  if (i < *n)
-    croak("Did not generate enough small primes.\n");
-  if (verbose > 1) printf("generated %lu small primes, from 2 to %lu\n", i, primea[i]);
-  return primea;
-}
+  a = _XS_legendre_pi(sqrt(n)+0.5);
 
-/* Given an array of primes[1..lastprime], return Pi(n) where n <= lastprime.
- * This is actually quite fast, and definitely faster than sieving.  By using
- * this we can avoid caching prime counts and also skip most calls to the
- * segment siever.
- */
-static UV bs_prime_count(UV n, UV const* const primes, UV lastprime)
-{
-  UV i, j;
-  if (n < 2)  return 0;
-  /* if (n > primes[lastprime])  return _XS_prime_count(2, n); */
-  if (n > primes[lastprime])
-    croak("called bspc(%lu) with counts up to %lu\n", n, primes[lastprime]);
-  i = 1;
-  j = lastprime;
-  while (i < j) {
-    UV mid = (i+j)/2;
-    if (primes[mid] <= n)  i = mid+1;
-    else                   j = mid;
-  }
-  return i-1;
+  return phi(n, a) + a - 1;
 }
 
 
-/* Legendre's method.  Interesting and a good test for phi(x,a), but Lehmer's
- * method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
-UV _XS_legendre_pi(UV n)
+/* Meissel's method. */
+UV _XS_meissel_pi(UV n)
 {
-  UV a;
-  if (n < 30000)
+  UV a, b, c, sum, i, lastprime, lastpc, lastw, lastwpc;
+  const UV* primes = 0;  /* small prime cache */
+  DECLARE_TIMING_VARIABLES;
+  if (n < SIEVE_LIMIT)
     return _XS_prime_count(2, n);
 
-  a = _XS_legendre_pi(sqrt(n));
-  prime_precalc(a);
-  return phi(n, a, 0) + a - 1;
+  if (verbose > 0) printf("meissel %lu stage 1: calculate a,b,c \n", n);
+  TIMING_START;
+  a = _XS_meissel_pi(pow(n, 1.0/3.0)+0.5);     /* a = floor(n^1/3) */
+  b = _XS_meissel_pi(sqrt(n)+0.5);             /* b = floor(n^1/2) */
+  c = a;                                       /* c = a            */
+  TIMING_END_PRINT("stage 1")
+
+  if (verbose > 0) printf("meissel %lu stage 2: phi(x,a) (a=%lu b=%lu c=%lu)\n", n, a, b, c);
+  TIMING_START;
+  sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
+  if (verbose > 0) printf("phi(%lu,%lu) = %lu.  sum = %lu\n", n, a, sum - ((b+a-2) * (b-a+1) / 2), sum);
+  TIMING_END_PRINT("phi(x,a)")
+
+  lastprime = b*16;
+  if (verbose > 0) printf("meissel %lu stage 3: %lu small primes\n", n, lastprime);
+  TIMING_START;
+  primes = generate_small_primes(lastprime);
+  if (primes == 0) croak("Error generating primes.\n");
+  lastpc = primes[lastprime];
+  TIMING_END_PRINT("small primes")
+
+  prime_precalc( sqrt( n / primes[a+1] ) );
+
+  if (verbose > 0) printf("meissel %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]);
+  TIMING_START;
+  /* Reverse the i loop so w increases.  Count w in segments. */
+  lastw = 0;
+  lastwpc = 0;
+  for (i = b; i > a; i--) {
+    UV w = n / primes[i];
+    lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime)
+                            : lastwpc + _XS_prime_count(lastw+1, w);
+    lastw = w;
+    sum = sum - lastwpc;
+  }
+  TIMING_END_PRINT("stage 4")
+  Safefree(primes);
+  return sum;
 }
 
-/* Lehmer's method.  This is basically Riesel's Lehmer function (page 22),
- * with some additional code to help optimize it.
- */
 
+/* Lehmer's method.  This is basically Riesel's Lehmer function (page 22),
+ * with some additional code to help optimize it.  */
 UV _XS_lehmer_pi(UV n)
 {
-  UV z, a, b, c, sum, i, j, lastprimea, lastpc, lastw, lastwpc;
-  UV* primea = 0;   /* small prime cache, first b=pi(z)=pi(sqrt(n)) primes */
+  UV z, a, b, c, sum, i, j, lastprime, lastpc, lastw, lastwpc;
+  const UV* primes = 0; /* small prime cache, first b=pi(z)=pi(sqrt(n)) */
   DECLARE_TIMING_VARIABLES;
 
-  if (n < 1000000)
+  if (n < SIEVE_LIMIT)
     return _XS_prime_count(2, n);
 
   if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n);
   TIMING_START;
   z = sqrt((double)n+0.5);
-  a = _XS_lehmer_pi(sqrt((double)z)+0.5);
-  b = _XS_lehmer_pi(z);
-  c = _XS_lehmer_pi(pow((double)n, 1.0/3.0)+0.5);
+  a = _XS_lehmer_pi(sqrt((double)z)+0.5);          /* a = floor(n^1/4) */
+  b = _XS_lehmer_pi(z);                            /* b = floor(n^1/2) */
+  c = _XS_lehmer_pi(pow((double)n, 1.0/3.0)+0.5);  /* c = floor(n^1/3) */
   TIMING_END_PRINT("stage 1")
 
 
-  /* We're going to sieve for small primes twice, in the interest of saving
-   * memory.  For phi(x,a), get just as many as are needed (a+1) because it
-   * is our memory bottleneck.  Then sieve for more afterwards. */
   if (verbose > 0) printf("lehmer %lu stage 2: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
   TIMING_START;
-  lastprimea = a+1;
-  primea = generate_small_primes(&lastprimea);
-  if (primea == 0 || lastprimea < a+1) croak("Error generating primes.\n");
-
-  sum = phi(n, a, primea) + ( (b+a-2) * (b-a+1) / 2);
-
-  Safefree(primea);
+  sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
   TIMING_END_PRINT("phi(x,a)")
 
 
@@ -425,38 +540,72 @@ UV _XS_lehmer_pi(UV n)
    * fast prime counts.  Using a higher value here will mean more memory but
    * faster operation.  A lower value saves memory at the expense of more
    * segment sieving.*/
-  lastprimea = b*16;
-  if (verbose > 0) printf("lehmer %lu stage 3: %lu small primes\n", n, lastprimea);
+  lastprime = b*16;
+  if (verbose > 0) printf("lehmer %lu stage 3: %lu small primes\n", n, lastprime);
   TIMING_START;
-  primea = generate_small_primes(&lastprimea);
-  if (primea == 0 || lastprimea < b) croak("Error generating primes.\n");
-  lastpc = primea[lastprimea];
+  primes = generate_small_primes(lastprime);
+  if (primes == 0) croak("Error generating primes.\n");
+  lastpc = primes[lastprime];
   TIMING_END_PRINT("small primes")
 
 
-  /* Ensure we have the base sieve for big prime_count ( n/primea[i] ). */
+  /* Ensure we have the base sieve for big prime_count ( n/primes[i] ). */
   /* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */
-  prime_precalc( sqrt( n / primea[a+1] ) );
+  prime_precalc( sqrt( n / primes[a+1] ) );
 
-  if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primea[a+1]);
+  if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]);
   TIMING_START;
   /* Reverse the i loop so w increases.  Count w in segments. */
   lastw = 0;
   lastwpc = 0;
   for (i = b; i >= a+1; i--) {
-    UV w = n / primea[i];
-    lastwpc = (w <= lastpc) ? bs_prime_count(w, primea, lastprimea)
+    UV w = n / primes[i];
+    lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime)
                             : lastwpc + _XS_prime_count(lastw+1, w);
     lastw = w;
     sum = sum - lastwpc;
     if (i <= c) {
-      UV bi = bs_prime_count( (UV) (sqrt(w) + 0.5), primea, lastprimea );
+      UV bi = bs_prime_count( (UV) (sqrt(w) + 0.5), primes, lastprime );
       for (j = i; j <= bi; j++) {
-        sum = sum - bs_prime_count(w / primea[j], primea, lastprimea) + j - 1;
+        sum = sum - bs_prime_count(w / primes[j], primes, lastprime) + j - 1;
       }
     }
   }
   TIMING_END_PRINT("stage 4")
-  Safefree(primea);
+  Safefree(primes);
   return sum;
 }
+
+
+#ifdef PRIMESIEVE_STANDALONE
+int main(int argc, char *argv[])
+{
+  UV n, pi;
+  double t;
+  const char* method;
+  struct timeval t0, t1;
+
+  if (argc <= 1) { printf("usage: %s  <n>  [<method>]\n", argv[0]); return(1); }
+  n = atol(argv[1]);
+  if (n < 2) { printf("Pi(%lu) = 0\n", n); return(0); }
+
+  if (argc > 2)
+    method = argv[2];
+  else
+    method = "lehmer";
+
+  gettimeofday(&t0, 0);
+  if      (!strcasecmp(method, "lehmer"))   { pi = _XS_lehmer_pi(n);      }
+  else if (!strcasecmp(method, "meissel"))  { pi = _XS_meissel_pi(n);     }
+  else if (!strcasecmp(method, "legendre")) { pi = _XS_legendre_pi(n);    }
+  else if (!strcasecmp(method, "sieve"))    { pi = _XS_prime_count(2, n); }
+  else {
+    printf("method must be one of: lehmer, meissel, legendre, or sieve\n");
+    return(2);
+  }
+  gettimeofday(&t1, 0);
+  t = (t1.tv_sec-t0.tv_sec);  t *= 1000000.0;  t += (t1.tv_usec - t0.tv_usec);
+  printf("%8s Pi(%lu) = %lu   in %10.5fs\n", method, n, pi, t / 1000000.0);
+  return(0);
+}
+#endif
diff --git a/lehmer.h b/lehmer.h
index 410feb0..e071deb 100644
--- a/lehmer.h
+++ b/lehmer.h
@@ -4,6 +4,7 @@
 #include "ptypes.h"
 
 extern UV _XS_legendre_pi(UV n);
+extern UV _XS_meissel_pi(UV n);
 extern UV _XS_lehmer_pi(UV n);
 
 #endif
diff --git a/t/13-primecount.t b/t/13-primecount.t
index 23765db..dc1c53a 100644
--- a/t/13-primecount.t
+++ b/t/13-primecount.t
@@ -81,7 +81,8 @@ plan tests => 0 + 1
                 + 3*scalar(keys %pivals32)
                 + scalar(keys %pivals_small)
                 + $use64 * 3 * scalar(keys %pivals64)
-                + scalar(keys %intervals);
+                + scalar(keys %intervals)
+                + 1;
 
 ok( eval { prime_count(13); 1; }, "prime_count in void context");
 
@@ -113,6 +114,9 @@ while (my($range, $expect) = each (%intervals)) {
   is( prime_count($low,$high), $expect, "prime_count($range) = $expect");
 }
 
+# Defect found in prime binary search
+is( prime_count(130066574), 7381740, "prime_count(130066574) = 7381740");
+
 sub parse_range {
   my($range) = @_;
   my($low,$high);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git