[libmath-prime-util-perl] 40/50: More Lehmer speedups and memory enhancements

Thu May 21 18:45:39 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.13
in repository libmath-prime-util-perl.

commit 3d48233bb9028f0c4c5e57f7227c5032d3e52c39
Author: Dana Jacobsen <dana at acm.org>
Date:   Sat Nov 17 09:08:14 2012 -0800

    More Lehmer speedups and memory enhancements
---
 lehmer.c | 204 ++++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 117 insertions(+), 87 deletions(-)

diff --git a/lehmer.c b/lehmer.c
index 8d091d0..8019f71 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -34,17 +34,17 @@
  *
  * Calculating pi(10^11) is done in under 1 second on my computer.  pi(10^14)
  * takes under 1 minute, pi(10^16) in a half hour.  Compared with Thomas
- * R. Nicely's pix4 program, this one is 3-4x faster and uses 2-3x less memory.
+ * R. Nicely's pix4 program, this one is 3-5x faster and uses 2-3x less memory.
  *
  *    n     phi(x,a) mem/time  |  stage 4 mem/time  | total time
- *   10^17   5094MB    870.07   |   2688MB 11328.6    | 203m 20.0s
- *   10^16   1483MB    168.19   |   945MB   1489.2    |  27m 35.5s
- *   10^15    448MB     31.26   |   392MB    226.1    |   4m 17.1s
- *   10^14               5.519  |   217MB     38.36   |    43.76s
- *   10^13               0.950  |   162MB      7.025  |     7.993s
- *   10^12               0.174  |              1.363  |     1.574s
- *   10^11               0.034  |              0.278  |     0.346s
- *   10^10               0.007  |              0.058  |     0.103s
+ *   10^17   4953MB    871.14   |   2988MB  9911.9    | 179m 37.5s
+ *   10^16   1436MB    168.02   |   901MB   1195.7    |  22m 45.6s
+ *   10^15    432MB     31.34   |   394MB    165.6    |   3m 17.5s
+ *   10^14    203MB      5.509  |   223MB     25.96   |    31.69s
+ *   10^13               0.949  |   165MB      4.284  |     5.336s
+ *   10^12               0.174  |              0.755  |     0.990s
+ *   10^11               0.034  |              0.138  |     0.213s
+ *   10^10               0.007  |              0.025  |     0.064s
  *
  * Reference: Hans Riesel, "Prime Numbers and Computer Methods for
  * Factorization", 2nd edition, 1994.
@@ -152,6 +152,8 @@ typedef struct {
 
 static void heap_insert(heap_t* h, UV val, IV count)
 {
+  UV n;
+  vc_t* a;
   if (val < h->small_limit) {
     IV* saptr = h->small_array + val;
     if (*saptr == 0)
@@ -163,16 +165,20 @@ static void heap_insert(heap_t* h, UV val, IV count)
       h->small_ptr = (int) val;
     return;
   }
-  UV n = ++(h->N);
-  vc_t* a = h->array;
+  n = ++(h->N);
+  a = h->array;
   if (n >= h->array_size) {
-    UV new_size = (h->array_size == 0)
-                  ? (h->small_limit <= (20000/2)) ? 20000 : 2*h->small_limit
-                  : 1.5 * h->array_size;
-    if (verbose>2) printf("REALLOCing heap %p, new size %lu\n", h->array, new_size);
-    h->array = Renew( h->array, new_size, vc_t );
-    if (h->array == 0)
-      croak("could not allocate heap\n");
+    UV new_size;
+    if (h->array_size == 0) {
+      new_size = (h->small_limit <= (20000/2)) ? 20000 : 2*h->small_limit;
+      if (verbose>2) printf("ALLOCing heap, size %lu\n", new_size);
+      New(0, h->array, new_size, vc_t);
+    } else {
+      new_size = 1.5 * h->array_size;
+      if (verbose>2) printf("REALLOCing heap %p, new size %lu\n", h->array, new_size);
+      Renew( h->array, new_size, vc_t );
+    }
+    if (h->array == 0) croak("could not allocate heap\n");
     a = h->array;
     h->array_size = new_size-1;
     a[0].v = UV_MAX;  a[0].c = 0;
@@ -187,9 +193,9 @@ static void heap_insert(heap_t* h, UV val, IV count)
 static void heap_remove(heap_t* h, UV* val, IV* count)
 {
   if (h->N == 0) {
-    if (h->small_N == 0) croak("remove from empty heap\n");
     /* Search small_array for a non-zero count from small_ptr down */
     IV* saptr = h->small_array + h->small_ptr;
+    if (h->small_N == 0) croak("remove from empty heap\n");
     while (!*saptr)
       saptr--;
     if (saptr < h->small_array) croak("walked off small array\n");
@@ -264,9 +270,10 @@ static void heap_destroy(heap_t* h)
  * memory consuming part.
  */
 
-static UV phi(UV x, UV a, UV* primes)
+static UV phi(UV x, UV a, UV const* const primes)
 {
-  UV i, val;
+  heap_t h1, h2;
+  UV val;
   IV count, sum = 0;
   UV primea = primes ? primes[a+1] : _XS_nth_prime(a+1);
   if (x < primea)  return (x > 0) ? 1 : 0;
@@ -275,8 +282,8 @@ static UV phi(UV x, UV a, UV* primes)
 
   primea = primes ? primes[a] : _XS_prev_prime(primea);
 
-  heap_t h1 = heap_create(a * 1000);
-  heap_t h2 = heap_create(a * 1000);
+  h1 = heap_create(a * 1000);
+  h2 = heap_create(a * 1000);
 
   heap_insert(&h1, x, 1);
 
@@ -311,26 +318,82 @@ static UV phi(UV x, UV a, UV* primes)
 }
 
 
+static UV* generate_small_primes(UV *n)
+{
+  const unsigned char* sieve;
+  UV* primea;
+  UV  nth_prime;
+  UV  i;
+
+  /* Dusart 1999 bound */
+  nth_prime = (*n <= 10) ? 29 : *n * ( log(*n) + log(log(*n)) ) + 1;
+
+  if (get_prime_cache(nth_prime, &sieve) < nth_prime) {
+    release_prime_cache(sieve);
+    croak("Could not generate sieve for %"UVuf, nth_prime);
+  }
+  New(0, primea, *n+4, UV);
+  if (primea == 0)
+    croak("Can not allocate small primes\n");
+  primea[0] = 0; primea[1] = 2; primea[2] = 3; primea[3] = 5;
+  i = 3;
+  START_DO_FOR_EACH_SIEVE_PRIME( sieve, 7, nth_prime ) {
+    if (i >= *n) break;
+    primea[++i] = p;
+  } END_DO_FOR_EACH_SIEVE_PRIME
+  release_prime_cache(sieve);
+  if (i < *n)
+    croak("Did not generate enough small primes.\n");
+  if (verbose > 1) printf("generated %lu small primes, from 2 to %lu\n", i, primea[i]);
+  return primea;
+}
+
+/* Given an array of primes[1..lastprime], return Pi(n) where n <= lastprime.
+ * This is actually quite fast, and definitely faster than sieving.  By using
+ * this we can avoid caching prime counts and also skip most calls to the
+ * segment siever.
+ */
+static UV bs_prime_count(UV n, UV const* const primes, UV lastprime)
+{
+  UV i, j;
+  if (n < 2)  return 0;
+  /* if (n > primes[lastprime])  return _XS_prime_count(2, n); */
+  if (n > primes[lastprime])
+    croak("called bspc(%lu) with counts up to %lu\n", n, primes[lastprime]);
+  i = 1;
+  j = lastprime;
+  while (i < j) {
+    UV mid = (i+j)/2;
+    if (primes[mid] <= n)  i = mid+1;
+    else                   j = mid;
+  }
+  return i-1;
+}
+
+
 /* Legendre's method.  Interesting and a good test for phi(x,a), but Lehmer's
  * method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
 UV _XS_legendre_pi(UV n)
 {
+  UV a;
   if (n < 30000)
     return _XS_prime_count(2, n);
 
-  UV a = _XS_legendre_pi(sqrt(n));
+  a = _XS_legendre_pi(sqrt(n));
   prime_precalc(a);
   return phi(n, a, 0) + a - 1;
 }
 
-/* Lehmer's method.  See Riesel for basic program. */
+/* Lehmer's method.  This is basically Riesel's Lehmer function (page 22),
+ * with some additional code to help optimize it.
+ */
 
 UV _XS_lehmer_pi(UV n)
 {
-  DECLARE_TIMING_VARIABLES;
-  UV z, a, b, c, sum, i, j, lastw, lastwpc;
+  UV z, a, b, c, sum, i, j, lastprimea, lastpc, lastw, lastwpc;
   UV* primea = 0;   /* small prime cache, first b=pi(z)=pi(sqrt(n)) primes */
-  UV* pca = 0;      /* small prime count cache, first z=sqrt(n) numbers */
+  DECLARE_TIMING_VARIABLES;
+
   if (n < 1000000)
     return _XS_prime_count(2, n);
 
@@ -343,69 +406,38 @@ UV _XS_lehmer_pi(UV n)
   TIMING_END_PRINT("stage 1")
 
 
-  if (verbose > 0) printf("lehmer %lu stage 2: %lu small primes, %lu counts\n", n, b, z);
+  /* We're going to sieve for small primes twice, in the interest of saving
+   * memory.  For phi(x,a), get just as many as are needed (a+1) because it
+   * is our memory bottleneck.  Then sieve for more afterwards. */
+  if (verbose > 0) printf("lehmer %lu stage 2: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
   TIMING_START;
-  { /* Calculate the first b primes */
-    const unsigned char* sieve;
-    if (get_prime_cache(z, &sieve) < z) {
-      release_prime_cache(sieve);
-      croak("Could not generate sieve for %"UVuf, z);
-    }
-    New(0, primea, b+4, UV);
-    if (primea == 0)
-      croak("Can not allocate small primes\n");
-    primea[0] = 0;
-    primea[1] = 2;
-    primea[2] = 3;
-    primea[3] = 5;
-    i = 3;
-    START_DO_FOR_EACH_SIEVE_PRIME( sieve, 7, z ) {
-      if (i > b) break;
-      primea[++i] = p;
-    } END_DO_FOR_EACH_SIEVE_PRIME
-    release_prime_cache(sieve);
-    if (i < b)
-      croak("Did not generate enough small primes.  Bug in Lehmer code.\n");
-    if (verbose > 1) printf("generated first %lu small primes, from 2 to %lu\n", i, z);
-  }
-  TIMING_END_PRINT("small primes")
+  lastprimea = a+1;
+  primea = generate_small_primes(&lastprimea);
+  if (primea == 0 || lastprimea < a+1) croak("Error generating primes.\n");
 
-
-  if (verbose > 0) printf("lehmer %lu stage 3: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
-  TIMING_START;
   sum = phi(n, a, primea) + ( (b+a-2) * (b-a+1) / 2);
+
+  Safefree(primea);
   TIMING_END_PRINT("phi(x,a)")
 
 
+  /* Sieve to get small primes.  Get more than the minimum needed (b) to allow
+   * fast prime counts.  Using a higher value here will mean more memory but
+   * faster operation.  A lower value saves memory at the expense of more
+   * segment sieving.*/
+  lastprimea = b*16;
+  if (verbose > 0) printf("lehmer %lu stage 3: %lu small primes\n", n, lastprimea);
   TIMING_START;
-  { /* Generate primecounts up to z used by stage 4. */
-    UV count = 0;
-    New(0, pca, z+4, UV);
-    if (pca == 0)
-      croak("Can not allocate small prime counts\n");
-    for (i = 0; i <= z && count+1 <= b; i++) {
-      if (i >= primea[count+1])
-        count++;
-      pca[i] = count;
-    }
-    if (verbose > 1) printf("generated first %lu prime counts, from 2 to %lu\n", count, z);
-  }
- TIMING_END_PRINT("prime counts")
-printf("precalculated counts to %lu\n", z);
+  primea = generate_small_primes(&lastprimea);
+  if (primea == 0 || lastprimea < b) croak("Error generating primes.\n");
+  lastpc = primea[lastprimea];
+  TIMING_END_PRINT("small primes")
 
-  /* Ensure we have the base sieve for prime_count ( n/primea[i] ). */
+
+  /* Ensure we have the base sieve for big prime_count ( n/primea[i] ). */
   /* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */
   prime_precalc( sqrt( n / primea[a+1] ) );
 
- /* TODO:
-  *  (1) generate 2b primes instead of b
-  *  (2) generate counts for all 2b primes
-  *  (3) store counts on odd numbers only.
-  *  (4) in loop below, check if we have the result in pca and use it.
-  *  This will cut in half the prime_count calls below, at a little memory
-  *  cost (almost entirely mitigated by change #3)
-  */
-
   if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primea[a+1]);
   TIMING_START;
   /* Reverse the i loop so w increases.  Count w in segments. */
@@ -413,20 +445,18 @@ printf("precalculated counts to %lu\n", z);
   lastwpc = 0;
   for (i = b; i >= a+1; i--) {
     UV w = n / primea[i];
-    lastwpc += _XS_prime_count(lastw+1, w);
+    lastwpc = (w <= lastpc) ? bs_prime_count(w, primea, lastprimea)
+                            : lastwpc + _XS_prime_count(lastw+1, w);
     lastw = w;
     sum = sum - lastwpc;
     if (i <= c) {
-      UV bi = pca[(UV) (sqrt(w) + 0.5)];
+      UV bi = bs_prime_count( (UV) (sqrt(w) + 0.5), primea, lastprimea );
       for (j = i; j <= bi; j++) {
-        sum = sum - pca[w / primea[j]] + j - 1;
+        sum = sum - bs_prime_count(w / primea[j], primea, lastprimea) + j - 1;
       }
     }
   }
   TIMING_END_PRINT("stage 4")
-  if (pca != 0)
-    Safefree(pca);
-  if (primea != 0)
-    Safefree(primea);
+  Safefree(primea);
   return sum;
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git