[libmath-prime-util-perl] 39/50: More Lehmer improvements (faster, lower memory)

Thu May 21 18:45:39 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.13
in repository libmath-prime-util-perl.

commit a7bc5c62a23097c8eaf8f663d958e89aa29a65ad
Author: Dana Jacobsen <dana at acm.org>
Date:   Fri Nov 16 23:04:13 2012 -0800

    More Lehmer improvements (faster, lower memory)
---
 lehmer.c | 292 +++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 154 insertions(+), 138 deletions(-)

diff --git a/lehmer.c b/lehmer.c
index 4977c0f..8d091d0 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -33,22 +33,21 @@
  * with large values of x.
  *
  * Calculating pi(10^11) is done in under 1 second on my computer.  pi(10^14)
- * takes about 1 minute, pi(10^16) in under an hour.  Compared to Thomas
- * R. Nicely's pix4 program, this one is about 3-4x faster and uses 3x less
- * memory.  However those comparisons were done without the big pre-computed
- * prime count tables pix4 can use to speed up the final stage.
+ * takes under 1 minute, pi(10^16) in a half hour.  Compared with Thomas
+ * R. Nicely's pix4 program, this one is 3-4x faster and uses 2-3x less memory.
  *
  *    n     phi(x,a) mem/time  |  stage 4 mem/time  | total time
- *   10^17   5988MB  1244.11   |   2688MB
- *   10^16   1877MB   254.83   |   945MB  1478.9    |  28m 51.4s
- *   10^15    754MB    50.50   |   392MB   224.2    |   4m 33.8s
- *   10^14    260MB     9.879  |   217MB    38.06   |    47.83s
- *   10^13              1.852  |   162MB     6.987  |     8.864s
- *   10^12              0.363  |             1.358  |     1.768s
- *   10^11              0.064  |             0.276  |     0.381s
- *   10^10  xxxxxMB     0.011  | xxxxxMB     0.056  |     0.105s
+ *   10^17   5094MB    870.07   |   2688MB 11328.6    | 203m 20.0s
+ *   10^16   1483MB    168.19   |   945MB   1489.2    |  27m 35.5s
+ *   10^15    448MB     31.26   |   392MB    226.1    |   4m 17.1s
+ *   10^14               5.519  |   217MB     38.36   |    43.76s
+ *   10^13               0.950  |   162MB      7.025  |     7.993s
+ *   10^12               0.174  |              1.363  |     1.574s
+ *   10^11               0.034  |              0.278  |     0.346s
+ *   10^10               0.007  |              0.058  |     0.103s
  *
- * Reference: "Prime numbers and computer methods for factorization", Riesel.
+ * Reference: Hans Riesel, "Prime Numbers and Computer Methods for
+ * Factorization", 2nd edition, 1994.
  */
 
 static int const verbose = 0;
@@ -70,8 +69,7 @@ static int const verbose = 0;
 
 /* Use Mapes' method to calculate phi(x,a) for small a.  This is really
  * convenient and a little Perl script will spit this code out for whatever
- * limit we select.  It gets unwieldy however, and using the primorial/totient
- * method looks faster for the final output.
+ * limit we select.  It gets unwieldy with large a values.
  */
 static IV mapes(IV x, UV a)
 {
@@ -87,197 +85,214 @@ static IV mapes(IV x, UV a)
   return val;
 }
 
+static IV mapes7(IV x) {    /* A tiny bit faster setup for a=7 */
+  IV val = x-x/2-x/3-x/5+x/6-x/7+x/10-x/11-x/13+x/14+x/15-x/17+x/21+x/22+x/26
+          -x/30+x/33+x/34+x/35+x/39-x/42+x/51+x/55+x/65-x/66-x/70+x/77-x/78
+          +x/85+x/91-x/102-x/105-x/110+x/119-x/130+x/143-x/154-x/165-x/170
+          -x/182+x/187-x/195+x/210+x/221-x/231-x/238-x/255-x/273-x/286+x/330
+          -x/357-x/374-x/385+x/390-x/429-x/442-x/455+x/462+x/510+x/546-x/561
+          -x/595-x/663+x/714;
+  if (x >= 715) {
+    val += -x/715+x/770+x/858+x/910-x/935-x/1001-x/1105+x/1122+x/1155+x/1190
+           -x/1309+x/1326+x/1365+x/1430-x/1547+x/1785+x/1870+x/2002+x/2145
+           +x/2210-x/2310-x/2431+x/2618-x/2730+x/2805+x/3003+x/3094+x/3315
+           -x/3570+x/3927-x/4290+x/4641+x/4862+x/5005-x/5610-x/6006+x/6545
+           -x/6630+x/7293+x/7735-x/7854;
+    if (x >= 9282)
+      val += -x/9282-x/10010+x/12155-x/13090-x/14586-x/15015-x/15470+x/17017
+             -x/19635-x/23205-x/24310+x/30030-x/34034-x/36465+x/39270+x/46410
+             -x/51051+x/72930-x/85085+x/102102+x/170170+x/255255-x/510510;
+  }
+  return val;
+}
+
 /******************************************************************************/
 /*   Modified heap for manipulating our UV value / IV count pairs             */
 /******************************************************************************/
 
-/* TODO: This should be cleaned up */
+/*
+ * This is a heap augmented with a small array.  We store values and signed
+ * counts, where all counts for the same value are summed.  An easy way to
+ * do this in Perl/Python is a hash.  In plain C, I don't believe this is the
+ * best solution.  A heap can be implemented both easily and very efficiently
+ * (using a linear array), and as we pull items off the heap we can combine
+ * all similar values.
+ *
+ * Below some threshold value ('small_limit') the items become dense.  That is,
+ * not only are the values small but we have many items in that range.  Hence
+ * the small array augmentation.  All values below the threshold are just put
+ * directly into an array.  This not only handles them a little faster but
+ * helps reduce the heap size a bit, as we don't put any repeated values in
+ * the heap for the small items.  Since they're dense in this range, we can
+ * do a linear scan to find the next non-zero count.
+ *
+ * An ideal data structure for our purpose would coalesce values on insertion,
+ * and would allow operating in place (so we could retrieve all our items and
+ * add new items as we go, without them appearing on this scan).  The former
+ * is possible using an ordered list or a balanced tree.  I don't know how we
+ * would achieve the latter.  The point being that we're pulling items off of
+ * h1 and adding it (plus possibly a new item) to h2, so ideally we'd manage
+ * to use that space freed up by h1.
+ */
 
-/* heap of values and counts, stored by value */
 typedef struct {
   UV v;
   IV c;
 } vc_t;
 
 typedef struct {
-  vc_t* array;
-  UV    array_size;
-  UV    N;
-  UV    small_limit;
-  IV*   small_array;
-  UV    Nsmall;
-  int   ptr_small;
+  UV    small_limit;    /* small count array: size */
+  UV    small_N;        /* small count array: number of non-zero elements */
+  int   small_ptr;      /* small count array: index of largest non-zero value */
+  UV    array_size;     /* heap: allocated size in elements */
+  UV    N;              /* heap: number of elements */
+  IV*   small_array;    /* small count array data */
+  vc_t* array;          /* heap data */
 } heap_t;
 
-static void heap_insert(heap_t* h, vc_t v)
+static void heap_insert(heap_t* h, UV val, IV count)
 {
-  UV k;
-  UV val = v.v;
-  if (v.c == 0)
-    return;
   if (val < h->small_limit) {
-    if (h->small_array[val] == 0) ++(h->Nsmall);
-    h->small_array[val] += v.c;
-    if (h->small_array[val] == 0) --(h->Nsmall);
-    else if (h->ptr_small < (int)val) h->ptr_small = (int) val;
-// printf("small array insert %lu count %ld, coalesce to %ld, Nsmall = %lu, ptr %d\n", v.v, v.c, h->small_array[val], h->Nsmall, h->ptr_small);
+    IV* saptr = h->small_array + val;
+    if (*saptr == 0)
+      ++(h->small_N);
+    *saptr += count;
+    if (*saptr == 0)
+      --(h->small_N);
+    else if (h->small_ptr < (int)val)
+      h->small_ptr = (int) val;
     return;
   }
   UV n = ++(h->N);
   vc_t* a = h->array;
   if (n >= h->array_size) {
-    UV new_size = (h->array_size == 0) ? 20000 : 1.5 * h->array_size;
-    if (verbose>2) printf("REALLOCing %p, new size %lu\n", h->array, new_size);
-    h->array = realloc( h->array, new_size * sizeof(vc_t) );
+    UV new_size = (h->array_size == 0)
+                  ? (h->small_limit <= (20000/2)) ? 20000 : 2*h->small_limit
+                  : 1.5 * h->array_size;
+    if (verbose>2) printf("REALLOCing heap %p, new size %lu\n", h->array, new_size);
+    h->array = Renew( h->array, new_size, vc_t );
     if (h->array == 0)
       croak("could not allocate heap\n");
     a = h->array;
     h->array_size = new_size-1;
     a[0].v = UV_MAX;  a[0].c = 0;
   }
-  a[n] = v;
-  k = n;
-  while (a[k/2].v <= v.v) {  /* upheap */
-    a[k] = a[k/2];
-    k /= 2;
-  }
-  a[k] = v;
-}
-
-static vc_t heap_remove(heap_t* h)
-{
-  UV k;
-  UV n = h->N;
-  vc_t* a = h->array;
-  vc_t v, top;
-  if (n == 0)
-    croak("removing from empty heap\n");
-  top = a[1];
-  v = a[1] = a[n];
-  n = --(h->N);                     /* downheap */
-  k = 1;
-  while (k <= n/2) {
-    UV j = k+k;
-    if (j < n && a[j].v < a[j+1].v)  j++;
-    if (v.v >= a[j].v) break;
-    a[k] = a[j];
-    k = j;
+  while (a[n/2].v <= val) {  /* upheap */
+    a[n] = a[n/2];
+    n /= 2;
   }
-  a[k] = v;
-  return top;
+  a[n].v = val;
+  a[n].c = count;
 }
-static vc_t heap_remove_coalesce(heap_t* h)
+static void heap_remove(heap_t* h, UV* val, IV* count)
 {
-  vc_t v;
   if (h->N == 0) {
-    int i;
-    if (h->Nsmall == 0) croak("remove from empty heap\n");
-    for (i = h->ptr_small; i >= 0; i--) {
-      if (h->small_array[i] != 0) {
-        v.v = (UV) i;
-        v.c = h->small_array[i];
-        h->small_array[i] = 0;
-        h->ptr_small = i-1;
-        --(h->Nsmall);
- //printf("small array returning %lu count %ld\n", v.v, v.c);
-        return v;
+    if (h->small_N == 0) croak("remove from empty heap\n");
+    /* Search small_array for a non-zero count from small_ptr down */
+    IV* saptr = h->small_array + h->small_ptr;
+    while (!*saptr)
+      saptr--;
+    if (saptr < h->small_array) croak("walked off small array\n");
+    *val = (saptr - h->small_array);
+    *count = *saptr;
+    *saptr = 0;
+    h->small_ptr = *val-1;
+    --(h->small_N);
+    return;
+  } else {
+    vc_t* a = h->array;
+    UV ival, k, n = h->N;
+    *val = a[1].v;
+    *count = 0;
+    do {
+      *count += a[1].c;
+      /* remove top element */
+      ival = a[n--].v;
+      k = 1;
+      while (k <= n/2) {
+        UV j = k+k;
+        if (j < n && a[j].v < a[j+1].v)  j++;
+        if (ival >= a[j].v) break;
+        a[k] = a[j];
+        k = j;
       }
-    }
-    croak("walked off small array\n");
-  }
-  v = heap_remove(h);
-  /* get rest of entries with same value */
-  while (h->N > 0 && h->array[1].v == v.v) {
-    v.c += h->array[1].c;
-    (void) heap_remove(h);
+      a[k] = a[n+1];
+    } while (n > 0 && a[1].v == *val);
+    h->N = n;
   }
-  return v;
 }
 static heap_t heap_create(UV small_size)
 {
   heap_t h;
   h.array = 0;
   h.array_size = 0;
-  h.N = 0;
   h.small_limit = small_size;
   h.small_array = 0;
   if (small_size > 0) {
     if (verbose>1)printf("creating small array of size %lu\n", small_size);
-    h.small_array = (IV*) calloc( small_size, sizeof(IV) );
+    Newz(0, h.small_array, small_size, IV);
   }
-  h.Nsmall = 0;
-  h.ptr_small = -1;
+  h.N = 0;
+  h.small_N = 0;
+  h.small_ptr = -1;
   return h;
 }
 static void heap_destroy(heap_t* h)
 {
   if (h->array != 0) {
-    if (verbose > 2) printf("FREE %p\n", h->array);
-    free(h->array);
+    if (verbose > 2) printf("FREE heap %p\n", h->array);
+    Safefree(h->array);
   }
   h->array = 0;
   h->array_size = 0;
   h->N = 0;
-  if (h->small_array != 0) {
-    free(h->small_array);
-  }
+  if (h->small_array != 0)
+    Safefree(h->small_array);
   h->small_array = 0;
-  h->Nsmall = 0;
-  h->ptr_small = -1;
-}
-static void heap_empty(heap_t* h)
-{
-  h->N = 0;
-  h->Nsmall = 0;
-  h->ptr_small = -1;
-  if (h->small_limit > 0)
-    memset(h->small_array, 0, h->small_limit * sizeof(IV));
+  h->small_N = 0;
+  h->small_ptr = -1;
 }
-
-#define heap_not_empty(h) ((h).N > 0 || (h).Nsmall > 0)
+#define heap_not_empty(h)  ((h).N > 0 || (h).small_N > 0)
 
 /******************************************************************************/
 
 
 
 /*
- * The main phi(x,a) algorithm.  In this implementation, it takes about 25%
- * of the total time for the Lehmer algorithm, but it is by far the most memory
- * consuming section.
+ * The main phi(x,a) algorithm.  In this implementation, it takes about 15%
+ * of the total time for the Lehmer algorithm, but it is by far the most
+ * memory consuming part.
  */
 
 static UV phi(UV x, UV a, UV* primes)
 {
-  UV i;
-  IV  sum = 0;
-  UV  primea = primes ? primes[a+1] : _XS_nth_prime(a+1);
+  UV i, val;
+  IV count, sum = 0;
+  UV primea = primes ? primes[a+1] : _XS_nth_prime(a+1);
   if (x < primea)  return (x > 0) ? 1 : 0;
   if (a == 1)      return ((x+1)/2);
   if (a <= 7)      return mapes(x, a);
 
   primea = primes ? primes[a] : _XS_prev_prime(primea);
 
-  heap_t h1 = heap_create(20 * x/primea/primea/primea);
-  heap_t h2 = heap_create(20 * x/primea/primea/primea);
-  vc_t v;
+  heap_t h1 = heap_create(a * 1000);
+  heap_t h2 = heap_create(a * 1000);
+
+  heap_insert(&h1, x, 1);
 
-  v.v = x;  v.c = 1;
-  heap_insert(&h1, v);
-  while (a > 6) {
-    heap_empty(&h2);
+  while (a > 7) {
+    if (heap_not_empty(h2)) croak("h2 heap isn't empty.");
     while ( heap_not_empty(h1) ) {
       UV sval;
-      v = heap_remove_coalesce(&h1);
-      if (v.c == 0)
+      heap_remove(&h1, &val, &count);
+      if (count == 0)
         continue;
-      heap_insert(&h2, v);
-      sval = v.v / primea;
+      heap_insert(&h2, val, count);
+      sval = val / primea;
       if (sval >= primea) {
-        v.v = sval;
-        v.c = -v.c;
-        heap_insert(&h2, v);
+        heap_insert(&h2, sval, -count);
       } else {
-        sum -= v.c;
+        sum -= count;
       }
     }
     { heap_t t = h1; h1 = h2; h2 = t; }
@@ -285,13 +300,14 @@ static UV phi(UV x, UV a, UV* primes)
     primea = primes ? primes[a] : _XS_prev_prime(primea);
   }
   heap_destroy(&h2);
+  if (a != 7) croak("final loop is set for a=7, a = %lu\n", a);
   while ( heap_not_empty(h1) ) {
-    v = heap_remove_coalesce(&h1);
-    if (v.c != 0)
-      sum += v.c * mapes(v.v, a);  /* This could be faster */
+    heap_remove(&h1, &val, &count);
+    if (count != 0)
+      sum += count * mapes7(val);
   }
   heap_destroy(&h1);
-  return sum;
+  return (UV) sum;
 }
 
 
@@ -335,7 +351,7 @@ UV _XS_lehmer_pi(UV n)
       release_prime_cache(sieve);
       croak("Could not generate sieve for %"UVuf, z);
     }
-    primea = (UV*) malloc( (b+4) * sizeof(UV) );
+    New(0, primea, b+4, UV);
     if (primea == 0)
       croak("Can not allocate small primes\n");
     primea[0] = 0;
@@ -364,7 +380,7 @@ UV _XS_lehmer_pi(UV n)
   TIMING_START;
   { /* Generate primecounts up to z used by stage 4. */
     UV count = 0;
-    pca = (UV*) malloc( (z+4) * sizeof(UV) );
+    New(0, pca, z+4, UV);
     if (pca == 0)
       croak("Can not allocate small prime counts\n");
     for (i = 0; i <= z && count+1 <= b; i++) {
@@ -409,8 +425,8 @@ printf("precalculated counts to %lu\n", z);
   }
   TIMING_END_PRINT("stage 4")
   if (pca != 0)
-    free(pca);
+    Safefree(pca);
   if (primea != 0)
-    free(primea);
+    Safefree(primea);
   return sum;
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git