[libmath-prime-util-perl] 38/50: Comments and a small speedup for Lehmer

Thu May 21 18:45:38 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.13
in repository libmath-prime-util-perl.

commit 3b884098fe37e7793d923f0975ebd9918df4961a
Author: Dana Jacobsen <dana at acm.org>
Date:   Fri Nov 16 03:49:37 2012 -0800

    Comments and a small speedup for Lehmer
---
 lehmer.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 184 insertions(+), 26 deletions(-)

diff --git a/lehmer.c b/lehmer.c
index 5c82ba1..4977c0f 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -8,8 +8,71 @@
 #include "cache.h"
 #include "sieve.h"
 
+/*****************************************************************************
+ *
+ * Lehmer prime counting utility.  Calculates pi(x), count of primes <= x.
+ *
+ * Copyright (c) 2012 Dana Jacobsen (dana at acm.org).
+ * This is free software; you can redistribute it and/or modify it under
+ * the same terms as the Perl 5 programming language system itself.
+ *
+ * This file is part of the Math::Prime::Util Perl module.  It relies on two
+ * features: (1) a sieve to generate very small primes which could be as
+ * simple as a three-line SoE, and (2) _XS_prime_count(low, high) is expected
+ * to return the prime count within the segment low to high inclusive.  These
+ * are used for the relatively small segments in the final stage, but making
+ * it fast is important for the final performance.  The primesieve package is
+ * one source of excellent routines for either task.
+ *
+ * The phi(x,a) calculation is unique, to the best of my knowledge.  It keeps
+ * a heap of all x values + signed counts for the given 'a' value, and walks
+ * 'a' down until it is small enough to calculate directly (either with Mapes
+ * or using a calculated table using the primorial/totient method).  This
+ * is relatively fast and low memory compared to many other solutions.  As with
+ * all Lehmer-Meissel-Legendre algorithms, memory use will be a constraint
+ * with large values of x.
+ *
+ * Calculating pi(10^11) is done in under 1 second on my computer.  pi(10^14)
+ * takes about 1 minute, pi(10^16) in under an hour.  Compared to Thomas
+ * R. Nicely's pix4 program, this one is about 3-4x faster and uses 3x less
+ * memory.  However those comparisons were done without the big pre-computed
+ * prime count tables pix4 can use to speed up the final stage.
+ *
+ *    n     phi(x,a) mem/time  |  stage 4 mem/time  | total time
+ *   10^17   5988MB  1244.11   |   2688MB
+ *   10^16   1877MB   254.83   |   945MB  1478.9    |  28m 51.4s
+ *   10^15    754MB    50.50   |   392MB   224.2    |   4m 33.8s
+ *   10^14    260MB     9.879  |   217MB    38.06   |    47.83s
+ *   10^13              1.852  |   162MB     6.987  |     8.864s
+ *   10^12              0.363  |             1.358  |     1.768s
+ *   10^11              0.064  |             0.276  |     0.381s
+ *   10^10  xxxxxMB     0.011  | xxxxxMB     0.056  |     0.105s
+ *
+ * Reference: "Prime numbers and computer methods for factorization", Riesel.
+ */
+
 static int const verbose = 0;
+#define DO_TIMING 0
+#ifdef DO_TIMING
+ #include <sys/time.h>
+ #define DECLARE_TIMING_VARIABLES  struct timeval t0, t1;
+ #define TIMING_START   gettimeofday(&t0, 0);
+ #define TIMING_END_PRINT(text) \
+  { unsigned long long t; \
+    gettimeofday(&t1, 0); \
+    t = (t1.tv_sec-t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec); \
+    printf("%s: %10.5f\n", text, ((double)t) / 1000000); }
+#else
+ #define DECLARE_TIMING_VARIABLES
+ #define TIMING_START
+ #define TIMING_END_PRINT(text)
+#endif
 
+/* Use Mapes' method to calculate phi(x,a) for small a.  This is really
+ * convenient and a little Perl script will spit this code out for whatever
+ * limit we select.  It gets unwieldy however, and using the primorial/totient
+ * method looks faster for the final output.
+ */
 static IV mapes(IV x, UV a)
 {
   IV val;
@@ -24,6 +87,12 @@ static IV mapes(IV x, UV a)
   return val;
 }
 
+/******************************************************************************/
+/*   Modified heap for manipulating our UV value / IV count pairs             */
+/******************************************************************************/
+
+/* TODO: This should be cleaned up */
+
 /* heap of values and counts, stored by value */
 typedef struct {
   UV v;
@@ -34,15 +103,30 @@ typedef struct {
   vc_t* array;
   UV    array_size;
   UV    N;
+  UV    small_limit;
+  IV*   small_array;
+  UV    Nsmall;
+  int   ptr_small;
 } heap_t;
 
 static void heap_insert(heap_t* h, vc_t v)
 {
   UV k;
+  UV val = v.v;
+  if (v.c == 0)
+    return;
+  if (val < h->small_limit) {
+    if (h->small_array[val] == 0) ++(h->Nsmall);
+    h->small_array[val] += v.c;
+    if (h->small_array[val] == 0) --(h->Nsmall);
+    else if (h->ptr_small < (int)val) h->ptr_small = (int) val;
+// printf("small array insert %lu count %ld, coalesce to %ld, Nsmall = %lu, ptr %d\n", v.v, v.c, h->small_array[val], h->Nsmall, h->ptr_small);
+    return;
+  }
   UV n = ++(h->N);
   vc_t* a = h->array;
   if (n >= h->array_size) {
-    UV new_size = (h->array_size == 0) ? 20000 : 2 * h->array_size;
+    UV new_size = (h->array_size == 0) ? 20000 : 1.5 * h->array_size;
     if (verbose>2) printf("REALLOCing %p, new size %lu\n", h->array, new_size);
     h->array = realloc( h->array, new_size * sizeof(vc_t) );
     if (h->array == 0)
@@ -84,7 +168,24 @@ static vc_t heap_remove(heap_t* h)
 }
 static vc_t heap_remove_coalesce(heap_t* h)
 {
-  vc_t v = heap_remove(h);
+  vc_t v;
+  if (h->N == 0) {
+    int i;
+    if (h->Nsmall == 0) croak("remove from empty heap\n");
+    for (i = h->ptr_small; i >= 0; i--) {
+      if (h->small_array[i] != 0) {
+        v.v = (UV) i;
+        v.c = h->small_array[i];
+        h->small_array[i] = 0;
+        h->ptr_small = i-1;
+        --(h->Nsmall);
+ //printf("small array returning %lu count %ld\n", v.v, v.c);
+        return v;
+      }
+    }
+    croak("walked off small array\n");
+  }
+  v = heap_remove(h);
   /* get rest of entries with same value */
   while (h->N > 0 && h->array[1].v == v.v) {
     v.c += h->array[1].c;
@@ -92,12 +193,20 @@ static vc_t heap_remove_coalesce(heap_t* h)
   }
   return v;
 }
-static heap_t heap_create(void)
+static heap_t heap_create(UV small_size)
 {
   heap_t h;
   h.array = 0;
   h.array_size = 0;
   h.N = 0;
+  h.small_limit = small_size;
+  h.small_array = 0;
+  if (small_size > 0) {
+    if (verbose>1)printf("creating small array of size %lu\n", small_size);
+    h.small_array = (IV*) calloc( small_size, sizeof(IV) );
+  }
+  h.Nsmall = 0;
+  h.ptr_small = -1;
   return h;
 }
 static void heap_destroy(heap_t* h)
@@ -109,34 +218,54 @@ static void heap_destroy(heap_t* h)
   h->array = 0;
   h->array_size = 0;
   h->N = 0;
+  if (h->small_array != 0) {
+    free(h->small_array);
+  }
+  h->small_array = 0;
+  h->Nsmall = 0;
+  h->ptr_small = -1;
 }
 static void heap_empty(heap_t* h)
 {
   h->N = 0;
+  h->Nsmall = 0;
+  h->ptr_small = -1;
+  if (h->small_limit > 0)
+    memset(h->small_array, 0, h->small_limit * sizeof(IV));
 }
 
+#define heap_not_empty(h) ((h).N > 0 || (h).Nsmall > 0)
+
+/******************************************************************************/
+
 
-static UV phi(UV x, UV a)
+
+/*
+ * The main phi(x,a) algorithm.  In this implementation, it takes about 25%
+ * of the total time for the Lehmer algorithm, but it is by far the most memory
+ * consuming section.
+ */
+
+static UV phi(UV x, UV a, UV* primes)
 {
   UV i;
-  IV  primea;
   IV  sum = 0;
-  if (x < _XS_nth_prime(a+1))  return (x > 0) ? 1 : 0;
-  if (a == 1) return ((x+1)/2);
-  if (a <= 7) return mapes(x, a);
+  UV  primea = primes ? primes[a+1] : _XS_nth_prime(a+1);
+  if (x < primea)  return (x > 0) ? 1 : 0;
+  if (a == 1)      return ((x+1)/2);
+  if (a <= 7)      return mapes(x, a);
 
-  primea = _XS_nth_prime(a);
+  primea = primes ? primes[a] : _XS_prev_prime(primea);
 
-  heap_t h1 = heap_create();
-  heap_t h2 = heap_create();
+  heap_t h1 = heap_create(20 * x/primea/primea/primea);
+  heap_t h2 = heap_create(20 * x/primea/primea/primea);
   vc_t v;
 
   v.v = x;  v.c = 1;
   heap_insert(&h1, v);
-  while (a > 5) {
+  while (a > 6) {
     heap_empty(&h2);
-    /* Walk heap */
-    while (h1.N > 0) {
+    while ( heap_not_empty(h1) ) {
       UV sval;
       v = heap_remove_coalesce(&h1);
       if (v.c == 0)
@@ -153,10 +282,10 @@ static UV phi(UV x, UV a)
     }
     { heap_t t = h1; h1 = h2; h2 = t; }
     a--;
-    primea = _XS_prev_prime(primea);
+    primea = primes ? primes[a] : _XS_prev_prime(primea);
   }
   heap_destroy(&h2);
-  while (h1.N > 0) {
+  while ( heap_not_empty(h1) ) {
     v = heap_remove_coalesce(&h1);
     if (v.c != 0)
       sum += v.c * mapes(v.v, a);  /* This could be faster */
@@ -165,6 +294,7 @@ static UV phi(UV x, UV a)
   return sum;
 }
 
+
 /* Legendre's method.  Interesting and a good test for phi(x,a), but Lehmer's
  * method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
 UV _XS_legendre_pi(UV n)
@@ -174,29 +304,32 @@ UV _XS_legendre_pi(UV n)
 
   UV a = _XS_legendre_pi(sqrt(n));
   prime_precalc(a);
-  return phi(n, a) + a - 1;
+  return phi(n, a, 0) + a - 1;
 }
 
+/* Lehmer's method.  See Riesel for basic program. */
+
 UV _XS_lehmer_pi(UV n)
 {
+  DECLARE_TIMING_VARIABLES;
   UV z, a, b, c, sum, i, j, lastw, lastwpc;
   UV* primea = 0;   /* small prime cache, first b=pi(z)=pi(sqrt(n)) primes */
   UV* pca = 0;      /* small prime count cache, first z=sqrt(n) numbers */
   if (n < 1000000)
     return _XS_prime_count(2, n);
 
-  if (verbose > 0) printf("lehmer stage 1 - %lu\n", n);
+  if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n);
+  TIMING_START;
   z = sqrt((double)n+0.5);
   a = _XS_lehmer_pi(sqrt((double)z)+0.5);
   b = _XS_lehmer_pi(z);
   c = _XS_lehmer_pi(pow((double)n, 1.0/3.0)+0.5);
-  if (verbose > 0) printf("lehmer stage 2 - %lu (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
-  sum = phi(n, a) + ( (b+a-2) * (b-a+1) / 2);
+  TIMING_END_PRINT("stage 1")
 
-  if (verbose > 0) printf("lehmer stage 3 - n=%lu a=%lu\n", n, a);
-  /* Calculate the first b primes and first z primecounts */
-  {
-    UV count = 0;
+
+  if (verbose > 0) printf("lehmer %lu stage 2: %lu small primes, %lu counts\n", n, b, z);
+  TIMING_START;
+  { /* Calculate the first b primes */
     const unsigned char* sieve;
     if (get_prime_cache(z, &sieve) < z) {
       release_prime_cache(sieve);
@@ -218,7 +351,19 @@ UV _XS_lehmer_pi(UV n)
     if (i < b)
       croak("Did not generate enough small primes.  Bug in Lehmer code.\n");
     if (verbose > 1) printf("generated first %lu small primes, from 2 to %lu\n", i, z);
+  }
+  TIMING_END_PRINT("small primes")
+
+
+  if (verbose > 0) printf("lehmer %lu stage 3: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
+  TIMING_START;
+  sum = phi(n, a, primea) + ( (b+a-2) * (b-a+1) / 2);
+  TIMING_END_PRINT("phi(x,a)")
 
+
+  TIMING_START;
+  { /* Generate primecounts up to z used by stage 4. */
+    UV count = 0;
     pca = (UV*) malloc( (z+4) * sizeof(UV) );
     if (pca == 0)
       croak("Can not allocate small prime counts\n");
@@ -229,12 +374,24 @@ UV _XS_lehmer_pi(UV n)
     }
     if (verbose > 1) printf("generated first %lu prime counts, from 2 to %lu\n", count, z);
   }
+ TIMING_END_PRINT("prime counts")
+printf("precalculated counts to %lu\n", z);
 
-  /* Ensure we have the base sieve for prime_count ( n/primea[i] ) */
+  /* Ensure we have the base sieve for prime_count ( n/primea[i] ). */
+  /* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */
   prime_precalc( sqrt( n / primea[a+1] ) );
 
+ /* TODO:
+  *  (1) generate 2b primes instead of b
+  *  (2) generate counts for all 2b primes
+  *  (3) store counts on odd numbers only.
+  *  (4) in loop below, check if we have the result in pca and use it.
+  *  This will cut in half the prime_count calls below, at a little memory
+  *  cost (almost entirely mitigated by change #3)
+  */
 
-  if (verbose > 0) printf("lehmer stage 4 - n=%lu a=%lu\n", n, a);
+  if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primea[a+1]);
+  TIMING_START;
   /* Reverse the i loop so w increases.  Count w in segments. */
   lastw = 0;
   lastwpc = 0;
@@ -250,6 +407,7 @@ UV _XS_lehmer_pi(UV n)
       }
     }
   }
+  TIMING_END_PRINT("stage 4")
   if (pca != 0)
     free(pca);
   if (primea != 0)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git