[libmath-prime-util-perl] 18/20: In-place phi(x,a) merge

Thu May 21 18:47:32 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.23
in repository libmath-prime-util-perl.

commit 0901cc7d0f2c9aca31ebd7f6c34c30ef017b524d
Author: Dana Jacobsen <dana at acm.org>
Date:   Mon Mar 4 18:18:37 2013 -0800

    In-place phi(x,a) merge
---
 Changes  |   7 +++-
 TODO     |   4 +-
 lehmer.c | 125 ++++++++++++++++++++++++++++++++++++++-------------------------
 sieve.c  |  53 ++++++++++++++-------------
 4 files changed, 112 insertions(+), 77 deletions(-)

diff --git a/Changes b/Changes
index a0b69db..c067d11 100644
--- a/Changes
+++ b/Changes
@@ -23,13 +23,18 @@ Revision history for Perl extension Math::Prime::Util.
 
     - Start on Lagarias-Miller-Odlyzko prime count.
 
+    - A new data structure for the phi(x,a) function used by all the fast
+      prime count routines.  Quite a bit faster and most importantly, uses
+      half the memory of the old structure.
+
     - Performance:
        - Divisor sum with no sub is ~10x faster.
        - Speed up PP version of exp_mangoldt, create XS version.
        - Zeta much faster as mentioned above.
        - faster nth_prime as mentioned above.
        - AKS about 10% faster.
-       - New data structure for phi(x,a) means even faster prime_count.
+       - Unroll a little more in sieve inner loop.  A couple percent faster.
+       - Faster prime_count and nth_prime due to new phi(x,a) (about 1.25x).
 
 0.22 26 February 2013
 
diff --git a/TODO b/TODO
index 65c9f1a..9a6844d 100644
--- a/TODO
+++ b/TODO
@@ -48,6 +48,6 @@
   that (1) handles any low / high, and (2) segments.  Take segment_primes
   from XS.xs to do this.
 
-- Do in-place merge in new phi(x,a).
-
 - Implement S2 calculation for LMO prime count.
+
+- Balance memory used in phi with memory for binary search pc in lehmer.
diff --git a/lehmer.c b/lehmer.c
index 65a7bd2..cad15ea 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -31,14 +31,14 @@
  * with large values of x.
  *
  * Using my sieve code with everything running in serial, calculating pi(10^12)
- * is done undef 1 second on my computer.  pi(10^14) takes under 30 seconds,
+ * is done under 1 second on my computer.  pi(10^14) takes under 30 seconds,
  * pi(10^16) in under 20 minutes.  Compared with Thomas R. Nicely's pix4
- * program, his one is 4-6x faster and uses 2-3x less memory.  When compiled
+ * program, his one is 4-6x faster and uses 2-4x less memory.  When compiled
  * with parallel primesieve it is another 2x or more faster:
  *   pix4(10^16) takes 124 minutes, this code + primesieve takes 4 minutes.
  *
- * Timings with Perl + MPU.  Using the standalone program with primesieve
- * speeds up stage 4 a lot for large values.
+ * Timings with Perl + MPU with all-serial computation.  Using the standalone
+ * program with parallel primesieve speeds up stage 4 a lot for large values.
  *
  *    n     phi(x,a) mem/time  |  stage 4 mem/time  | total time
  *   10^17   3000MB    275.29  | 2300MB   9911.9    | 179m 37.5s
@@ -299,7 +299,14 @@ static void vcarray_destroy(vcarray_t* l)
   l->size = 0;
   l->n = 0;
 }
-static void vcarray_insert(vcarray_t* l, UV val, IV count)
+/* Insert a value/count pair.  We do this indirection because about 80% of
+ * the calls result in a merge with the previous entry. */
+#define vcarray_insert(arr, val, count) \
+  if (arr.n > 0 && arr.a[arr.n-1].v == val) \
+    arr.a[arr.n-1].c += count; \
+  else \
+    vcarray_insert_func(&arr, val, count);
+static void vcarray_insert_func(vcarray_t* l, UV val, IV count)
 {
   UV n = l->n;
   if (n > 0 && l->a[n-1].v <= val) {
@@ -328,51 +335,80 @@ static void vcarray_insert(vcarray_t* l, UV val, IV count)
   l->n++;
 }
 
-static vcarray_t vcarray_merge(vcarray_t* a, vcarray_t* b)
+/* Merge the two sorted lists A and B into A.  Each list has no duplicates,
+ * but they may have duplications between the two.  We're quite interested
+ * in saving memory, so first remove all the duplicates, then do an in-place
+ * merge. */
+static void vcarray_merge(vcarray_t* a, vcarray_t* b)
 {
-  UV ai, bi, an, bn, k, mn;
-  vcarray_t m = vcarray_create();
-
-  /* printf("going to merge %lu and %lu elements\n", a->n, b->n); */
-  an = a->n;  bn = b->n;  mn = an+bn;
-  New(0, m.a, mn, vc_t);
-  m.size = mn;
-  /* merge */
-  ai = 0;  bi = 0;
+  long ai, bi, bj, k, kn;
+  UV an = a->n;
+  UV bn = b->n;
   vc_t* aa = a->a;
   vc_t* ba = b->a;
-  for (k = 0; k < mn; k++) {
+
+  /* Merge anything in B that appears in A. */
+  for (ai = 0, bi = 0, bj = 0; bi < bn; bi++) {
+    /* Skip forward in A until empty or aa[ai].v <= ba[bi].v */
+    UV bval = ba[bi].v;
+    while (ai < an && aa[ai].v > bval)
+      ai++;
+    /* if A empty then copy the remaining elements */
     if (ai >= an) {
-      if (bi >= bn) croak("ran out of data during merge");
-      while (k < mn)
-        m.a[k++] = ba[bi++];
+      if (bi == bj)
+        bj = bn;
+      else
+        while (bi < bn)
+          ba[bj++] = ba[bi++];
       break;
-    } else if (bi >= bn) {
-      while (k < mn)
-        m.a[k++] = aa[ai++];
+    }
+    if (aa[ai].v == bval)
+      aa[ai].c += ba[bi].c;
+    else
+      ba[bj++] = ba[bi];
+  }
+  if (verbose>2) printf("  removed %lu duplicates from b\n", bn - bj);
+  bn = bj;
+
+  if (bn == 0) {  /* In case they were all duplicates */
+    b->n = 0;
+    return;
+  }
+
+  /* kn = the final merged size.  All duplicates are gone, so this is exact. */
+  kn = an+bn;
+  if (a->size < kn) {  /* Make A big enough to hold kn elements */
+    UV new_size = (UV) (1.2 * kn);
+    if (verbose>2) printf("REALLOCing list %p, new size %lu\n", a->a, new_size);
+    Renew( a->a, new_size, vc_t );
+    aa = a->a;  /* this could have been changed by the realloc */
+    a->size = new_size;
+  }
+
+  /* merge A and B.  Very simple using reverse merge. */
+  ai = an-1;
+  bi = bn-1;
+  for (k = kn-1; k >= 0; k--) {
+    if (ai < 0) { /* A is exhausted, just filling in B */
+      if (bi < 0) croak("ran out of data during merge");
+      aa[k] = ba[bi--];
+    } else if (bi < 0) { /* We've caught up with A */
       break;
-    } else if (aa[ai].v > ba[bi].v) {
-      m.a[k] = aa[ai++];
-    } else if (ba[bi].v > aa[ai].v) {
-      m.a[k] = ba[bi++];
+    } else if (aa[ai].v < ba[bi].v) {
+      aa[k] = aa[ai--];
     } else {
-      m.a[k] = aa[ai];
-      m.a[k].c += ba[bi].c;
-      ai++;  bi++;  mn--;
+      if (aa[ai].v == ba[bi].v) croak("deduplication error");
+      aa[k] = ba[bi--];
     }
   }
-  m.n = k;
-  /* printf("done with merge, have %lu elements\n", m.n); */
-  return m;
+  a->n = kn;    /* A now has this many items */
+  b->n = 0;     /* B is marked empty */
 }
 
 
 /*
  * The main phi(x,a) algorithm.  In this implementation, it takes under 10%
- * of the total time for the Lehmer algorithm, but it is the memory
- * constraining part.
- *
- * TODO: Expand a1, then merge a2 directly into it.  Should save memory.
+ * of the total time for the Lehmer algorithm, but is a big memory consumer.
  */
 
 static UV phi(UV x, UV a)
@@ -392,7 +428,7 @@ static UV phi(UV x, UV a)
 
   vcarray_t a1 = vcarray_create();
   vcarray_t a2 = vcarray_create();
-  vcarray_insert(&a1, x, 1);
+  vcarray_insert(a1, x, 1);
 
   while (a > 7) {
     UV primea = primes[a];
@@ -403,22 +439,13 @@ static UV phi(UV x, UV a)
         continue;
       sval = val / primea;
       if (sval >= primea) {
-        vcarray_insert(&a2, sval, -count);
+        vcarray_insert(a2, sval, -count);
       } else {
         sum -= count;
       }
     }
-    { /* Merge a1 and a2 into a3 */
-      vcarray_t m = vcarray_merge(&a1, &a2);
-      /* printf("a1 size %lu  a2 size %lu  m size %lu\n",a1.n,a2.n,m.n); */
-      /* Empty a2 */
-      a2.n = 0;
-      /* erase a1 and replace with m */
-      vcarray_destroy(&a1);
-      a1.a = m.a;
-      a1.n = m.n;
-      a1.size = m.size;
-    }
+    /* Merge a1 and a2 into a1.  a2 will be emptied. */
+    vcarray_merge(&a1, &a2);
     a--;
   }
   vcarray_destroy(&a2);
diff --git a/sieve.c b/sieve.c
index 16d22fc..618fd21 100644
--- a/sieve.c
+++ b/sieve.c
@@ -94,6 +94,30 @@ static const unsigned char presieve13[PRESIEVE_SIZE] =
   0x18,0x89,0x08,0x25,0x44,0x22,0x30,0x14,0xc3,0x88,0x86,0x40,0x1a,
   0x28,0x30,0x85,0x09,0x54,0x60,0x43,0x24,0x92,0x81,0x08,0x04,0x70};
 
+#define FIND_COMPOSITE_POS(i,j) \
+  { \
+    UV dlast = d; \
+    do { \
+      d += dinc; \
+      m += minc; \
+      if (m >= 30) { d++; m -= 30; } \
+    } while ( masktab30[m] == 0 ); \
+    wdinc[i] = d - dlast; \
+    wmask[j] = masktab30[m]; \
+  }
+#define FIND_COMPOSITE_POSITIONS(p) \
+  do { \
+    FIND_COMPOSITE_POS(0,1) \
+    FIND_COMPOSITE_POS(1,2) \
+    FIND_COMPOSITE_POS(2,3) \
+    FIND_COMPOSITE_POS(3,4) \
+    FIND_COMPOSITE_POS(4,5) \
+    FIND_COMPOSITE_POS(5,6) \
+    FIND_COMPOSITE_POS(6,7) \
+    FIND_COMPOSITE_POS(7,0) \
+    d -= p; \
+  } while (0)
+
 static void sieve_prefill(unsigned char* mem, UV startd, UV endd)
 {
   UV nbytes = endd - startd + 1;
@@ -145,20 +169,9 @@ unsigned char* sieve_erat30(UV end)
     UV minc = (2*prime) - dinc*30;
     UV wdinc[8];
     unsigned char wmask[8];
-    int i;
 
     /* Find the positions of the next composites we will mark */
-    for (i = 1; i <= 8; i++) {
-      UV dlast = d;
-      do {
-        d += dinc;
-        m += minc;
-        if (m >= 30) { d++; m -= 30; }
-      } while ( masktab30[m] == 0 );
-      wdinc[i-1] = d - dlast;
-      wmask[i%8] = masktab30[m];
-    }
-    d -= prime;
+    FIND_COMPOSITE_POSITIONS(prime);
 #if 0
     assert(d == ((prime*prime)/30));
     assert(d < max_buf);
@@ -199,7 +212,8 @@ int sieve_segment(unsigned char* mem, UV startd, UV endd)
   /* Fill buffer with marked 7, 11, and 13 */
   sieve_prefill(mem, startd, endd);
 
-  limit = isqrt(endp) + 1;
+  limit = isqrt(endp);
+  if (limit*limit < endp) limit++;  /* ceil(sqrt(endp)) */
   /* printf("segment sieve from %"UVuf" to %"UVuf" (aux sieve to %"UVuf")\n", startp, endp, limit); */
   pcsize = get_prime_cache(limit, &sieve);
   if (pcsize < limit) {
@@ -236,20 +250,9 @@ int sieve_segment(unsigned char* mem, UV startd, UV endd)
       UV wdinc[8];
       unsigned char wmask[8];
       UV offset_endd = endd - startd;
-      int i;
 
       /* Find the positions of the next composites we will mark */
-      for (i = 1; i <= 8; i++) {
-        UV dlast = d;
-        do {
-          d += dinc;
-          m += minc;
-          if (m >= 30) { d++; m -= 30; }
-        } while ( masktab30[m] == 0 );
-        wdinc[i-1] = d - dlast;
-        wmask[i%8] = masktab30[m];
-      }
-      d -= p;
+      FIND_COMPOSITE_POSITIONS(p);
       d -= startd;
 #if 0
       i = 0;        /* Mark the composites */

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git