[libmath-prime-util-perl] 70/72: Updated LMO and Lehmer internals, switch to LMO

Thu May 21 18:49:43 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.32
in repository libmath-prime-util-perl.

commit 7e72cddede463629b716a6b00a62ec7b16585625
Author: Dana Jacobsen <dana at acm.org>
Date:   Tue Oct 15 19:29:17 2013 -0700

    Updated LMO and Lehmer internals, switch to LMO
---
 Changes                      |  11 +-
 examples/bench-primecount.pl |  45 +++++---
 lehmer.c                     | 254 +++++++++++++++++++++++++------------------
 lehmer.h                     |   1 +
 lib/Math/Prime/Util.pm       |  18 +--
 t/13-primecount.t            |   5 +-
 t/14-nthprime.t              |   5 +-
 t/19-moebius.t               |   8 ++
 t/31-threading.t             |   2 +-
 util.c                       |   4 +-
 10 files changed, 215 insertions(+), 138 deletions(-)

diff --git a/Changes b/Changes
index c2e766d..65e2993 100644
--- a/Changes
+++ b/Changes
@@ -16,10 +16,13 @@ Revision history for Perl module Math::Prime::Util
     - Added Math::Prime::Util::PrimeIterator.  A more feature-rich iterator
       than the simple closure one from prime_iterator.  Experimental.
 
-    - Make LMO primecount work (albeit a simplistic version).  Memory
-      limiting features added to phi calculations, so the phi calculations
-      should stay under ~150MB even with giant sizes.
-      Thanks to Kim Walisch for all discussions about this.
+    - Make very simple LMO primecount work, and switch prime_count to use it.
+      It is slower for large inputs, but uses much less memory.  For smaller
+      inputs it it as fast or faster.  Lehmer code modified to constrain
+      memory use at the expense of speed (starts taking effect at ~ 10^16).
+      Thanks to Kim Walisch for discussions about this.  Note that this is
+      a very simple implementation -- better code could run 10x faster and
+      use even less memory.
 
     - divisor_sum can take an integer 'k' in the second argument to compute
       sigma_k.  This is much faster than using subs, especially when the
diff --git a/examples/bench-primecount.pl b/examples/bench-primecount.pl
index 3ec4dcd..7298816 100755
--- a/examples/bench-primecount.pl
+++ b/examples/bench-primecount.pl
@@ -15,28 +15,41 @@ my $sum;
 
 print "Direct sieving:\n";
 cmpthese($count,{
-    ' 2' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[2-2]} },
-    ' 3' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[3-2]} },
-    ' 4' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[4-2]} },
-    ' 5' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[5-2]} },
-    ' 6' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[6-2]} },
-    ' 7' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[7-2]} },
-    ' 8' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[8-2]} },
+    ' 2' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[2-2]} },
+    ' 3' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[3-2]} },
+    ' 4' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[4-2]} },
+    ' 5' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[5-2]} },
+    ' 6' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[6-2]} },
+    ' 7' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[7-2]} },
+    ' 8' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[8-2]} },
     #' 9' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[9-2]} },
     #'10' => sub { $sum += Math::Prime::Util::_XS_prime_count($_) for @{$darray[10-2]} },
 });
 print "\n";
 print "Direct Lehmer:\n";
 cmpthese($count,{
-    ' 2' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[2-2]} },
-    ' 3' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[3-2]} },
-    ' 4' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[4-2]} },
-    ' 5' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[5-2]} },
-    ' 6' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[6-2]} },
-    ' 7' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[7-2]} },
-    ' 8' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[8-2]} },
-    ' 9' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[9-2]} },
-    '10' => sub { $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[10-2]} },
+    ' 2' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[2-2]} },
+    ' 3' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[3-2]} },
+    ' 4' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[4-2]} },
+    ' 5' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[5-2]} },
+    ' 6' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[6-2]} },
+    ' 7' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[7-2]} },
+    ' 8' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[8-2]} },
+    ' 9' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[9-2]} },
+    '10' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_lehmer_pi($_) for @{$darray[10-2]} },
+});
+print "\n";
+print "Direct LMO:\n";
+cmpthese($count,{
+    ' 2' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[2-2]} },
+    ' 3' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[3-2]} },
+    ' 4' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[4-2]} },
+    ' 5' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[5-2]} },
+    ' 6' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[6-2]} },
+    ' 7' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[7-2]} },
+    ' 8' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[8-2]} },
+    ' 9' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[9-2]} },
+    '10' => sub { prime_memfree(); $sum += Math::Prime::Util::_XS_LMO_pi($_) for @{$darray[10-2]} },
 });
 print "\n";
 
diff --git a/lehmer.c b/lehmer.c
index 3cfa8a7..6aae90a 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -46,7 +46,7 @@
  * with parallel primesieve it is over 10x faster.
  *   pix4(10^16) takes 124 minutes, this code + primesieve takes < 4 minutes.
  *
- * Timings with Perl + MPU with all-serial computation.
+ * Timings with Perl + MPU with all-serial computation, no memory limit.
  * The last column is the standalone time with 12-core parallel primesieve.
  *
  *    n     phi(x,a) mem/time  |  stage 4 mem/time  | total time | pps time
@@ -274,17 +274,26 @@ static UV icbrt(UV n)
  * this we can avoid caching prime counts and also skip most calls to the
  * segment siever.
  */
-static UV bs_prime_count(UV n, UV const* const primes, UV lastprime)
+static UV bs_prime_count(UV n, UV const* const primes, UV lastidx)
 {
   UV i, j;
-  if (n < 2)  return 0;
-  /* if (n > primes[lastprime])  return _XS_prime_count(2, n); */
-  if (n >= primes[lastprime]) {
-    if (n == primes[lastprime]) return lastprime;
-    croak("called bspc(%lu) with counts up to %lu\n", n, primes[lastprime]);
+  if (n <= 2)  return (n == 2);
+  /* if (n > primes[lastidx])  return _XS_prime_count(2, n); */
+  if (n >= primes[lastidx]) {
+    if (n == primes[lastidx]) return lastidx;
+    croak("called bspc(%lu) with counts up to %lu\n", n, primes[lastidx]);
+  }
+  j = lastidx;
+  if (n < 8480) {
+    i = 1 + (n>>4);
+    if (j > 1060) j = 1060;
+  } else if (n < 25875000) {
+    i = 793 + (n>>5);
+    if (j > (n>>3)) j = n>>3;
+  } else {
+    i = 1617183;
+    if (j > (n>>4)) j = n>>4;
   }
-  i = 1;
-  j = lastprime;
   while (i < j) {
     UV mid = (i+j)/2;
     if (primes[mid] <= n)  i = mid+1;
@@ -357,70 +366,66 @@ static uint32_t mapes7_32(uint32_t x) {
   return val;
 }
 
-/* Max memory = 2*A*X bytes, e.g. 2*400*16000 = 12.2 MB */
+/* Max memory = 2*A*X bytes, e.g. 2*400*24000 = 18.3 MB */
 #define PHICACHEA 400
-#define PHICACHEX 16000
-static uint32_t _phicache_max[PHICACHEA] = {0};
-static int16_t* _phicache[PHICACHEA] = {0};
-static void phicache_free(void) {
+#define PHICACHEX 24000
+typedef struct
+{
+  uint32_t max[PHICACHEA];
+  int16_t* val[PHICACHEA];
+} cache_t;
+static void phicache_init(cache_t* cache) {
+  int a;
+  for (a = 0; a < PHICACHEA; a++) {
+    cache->val[a] = 0;
+    cache->max[a] = 0;
+  }
+}
+static void phicache_free(cache_t* cache) {
   int a;
-#ifdef _OPENMP
-    #pragma omp critical(phicache)
-#endif
   for (a = 0; a < PHICACHEA; a++) {
-    if (_phicache[a] != 0)
-      Safefree(_phicache[a]);
-    _phicache[a] = 0;
-    _phicache_max[a] = 0;
+    if (cache->val[a] != 0)
+      Safefree(cache->val[a]);
+    cache->val[a] = 0;
+    cache->max[a] = 0;
   }
 }
+
 #define PHI_CACHE_POPULATED(x, a) \
   ((a) < PHICACHEA && (x) < PHICACHEX && \
-  _phicache_max[a] > (x) && _phicache[a][x] != 0)
+  cache->max[a] > (x) && cache->val[a][x] != 0)
 
-static void phi_cache_insert(UV x, UV a, IV sum) {
-#ifdef _OPENMP
-    #pragma omp critical(phicache)
-#endif
+static void phi_cache_insert(UV x, UV a, IV sum, cache_t* cache) {
   if (a < PHICACHEA && x < PHICACHEX) {
     uint32_t cap = ((x+1+31)/32)*32;
-    if (_phicache[a] == 0) {
-      Newz(0, _phicache[a], cap, int16_t);
-      _phicache_max[a] = cap;
-    } else if (_phicache_max[a] < cap) {
+    if (cache->val[a] == 0) {
+      Newz(0, cache->val[a], cap, int16_t);
+      cache->max[a] = cap;
+    } else if (cache->max[a] < cap) {
       uint32_t i;
-      Renew(_phicache[a], cap, int16_t);
-      for (i = _phicache_max[a]; i < cap; i++)
-        _phicache[a][i] = 0;
-      _phicache_max[a] = cap;
+      Renew(cache->val[a], cap, int16_t);
+      for (i = cache->max[a]; i < cap; i++)
+        cache->val[a][i] = 0;
+      cache->max[a] = cap;
     }
     if (sum < SHRT_MIN || sum > SHRT_MAX)
       croak("phi(%lu,%lu) 16-bit overflow: sum = %ld\n", x, a, sum);
-    _phicache[a][x] = sum;
+    cache->val[a][x] = sum;
   }
 }
-static size_t phicache_size(void) {
-  size_t sum = sizeof(_phicache);
-  int a;
-#ifdef _OPENMP
-    #pragma omp critical(phicache)
-#endif
-  for (a = 0; a < PHICACHEA; a++)
-    if (_phicache[a] != 0)
-      sum += _phicache_max[a] * sizeof(_phicache[a][0]);
-  return sum;
-}
 
-static IV _phi3(UV x, UV a, int sign, const UV* primes)
+static IV _phi3(UV x, UV a, int sign, const UV* const primes, const UV lastidx, cache_t* cache)
 {
   IV sum;
 
   if (a < 3)
     return sign * ((a==0) ? x : (a==1) ? x-x/2 : x-x/2-x/3+x/6);
   else if (PHI_CACHE_POPULATED(x, a))
-    return sign * _phicache[a][x];
+    return sign * cache->val[a][x];
   else if (x < primes[a+1])
     sum = sign;
+  else if (1 && x <= primes[lastidx] && x < primes[a]*primes[a])
+    sum = sign * (bs_prime_count(x, primes, lastidx) - a + 1);
   else {
     UV a2;
     if (a*a > x) {
@@ -428,19 +433,25 @@ static IV _phi3(UV x, UV a, int sign, const UV* primes)
       UV iters = bs_prime_count(ix, primes, a+1);
       sum = sign * (x - a + iters);
       for (a2 = 1; a2 <= iters; a2++)
-        sum += _phi3( FAST_DIV(x, primes[a2]), a2-1, -sign, primes);
-    } else {
-      sum = 0;
+        sum += _phi3( FAST_DIV(x, primes[a2]), a2-1, -sign, primes, lastidx, cache);
+    } else if (a >= 7) {
+      if (PHI_CACHE_POPULATED(x, 7))
+        sum = sign * cache->val[7][x];
+      else
+        sum = sign * mapes7(x);
       for (a2 = 8; a2 <= a; a2++)
-        sum += _phi3( FAST_DIV(x,primes[a2]), a2-1, -sign, primes);
-      a2 = (a > 7) ? 7 : a;
-      sum += sign * (PHI_CACHE_POPULATED(x, a2) ? _phicache[a2][x] : mapes(x, a2));
+        sum += _phi3( FAST_DIV(x,primes[a2]), a2-1, -sign, primes, lastidx, cache);
+    } else {
+      if (PHI_CACHE_POPULATED(x, a))
+        sum = sign * cache->val[a][x];
+      else
+        sum = sign * mapes(x, a);
     }
   }
-  phi_cache_insert(x, a, sign * sum);
+  phi_cache_insert(x, a, sign * sum, cache);
   return sum;
 }
-#define phi_small(x, a, primes)  _phi3(x, a, 1, primes)
+#define phi_small(x, a, primes, lastidx, cache)  _phi3(x, a, 1, primes, lastidx, cache)
 
 /******************************************************************************/
 /*   In-order lists for manipulating our UV value / IV count pairs            */
@@ -594,73 +605,93 @@ static void vcarray_remove_zeros(vcarray_t* a)
  * The main phi(x,a) algorithm.  In this implementation, it takes under 10%
  * of the total time for the Lehmer algorithm, but is a big memory consumer.
  */
+#define NTHRESH (MAX_PHI_MEM/16)
 
 static UV phi(UV x, UV a)
 {
-  UV i, val, sval;
+  UV i, val, sval, lastidx, lastprime;
   UV sum = 0;
   IV count;
   const UV* primes;
   vcarray_t a1, a2;
   vc_t* arr;
+  cache_t pcache; /* Cache for recursive phi */
 
   if (a == 1)  return ((x+1)/2);
   if (a <= 7)  return mapes(x, a);
 
-  primes = generate_small_primes(a+1);
+  lastidx = a+1;
+  primes = generate_small_primes(lastidx);
   if (primes == 0)
     croak("Could not generate primes for phi(%lu,%lu)\n", x, a);
-  if (x < primes[a+1])  { Safefree(primes); return (x > 0) ? 1 : 0; }
+  lastprime = primes[lastidx];
+  if (x < lastprime)  { Safefree(primes); return (x > 0) ? 1 : 0; }
+  phicache_init(&pcache);
 
   a1 = vcarray_create();
   a2 = vcarray_create();
   vcarray_insert(&a1, x, 1);
 
-  while (a > 7 && a1.n < (MAX_PHI_MEM/16)) {
+  while (a > 7) {
     UV primea = primes[a];
     UV sval_last = 0;
     IV sval_count = 0;
     arr = a1.a;
     for (i = 0; i < a1.n; i++) {
       count = arr[i].c;
-      if (count == 0) continue;      /* Skip if count = 0 */
       val  = arr[i].v;
       sval = FAST_DIV(val, primea);
       if (sval < primea) break;      /* stop inserting into a2 if small */
       if (sval != sval_last) {       /* non-merged value.  Insert into a2 */
-        if (sval_last != 0)
-          vcarray_insert(&a2, sval_last, sval_count);
+        if (sval_last != 0) {
+          if (sval_last <= lastprime && sval_last < primes[a-1]*primes[a-1])
+            sum += sval_count*(bs_prime_count(sval_last,primes,lastidx)-a+2);
+          else
+            vcarray_insert(&a2, sval_last, sval_count);
+        }
         sval_last = sval;
         sval_count = 0;
       }
       sval_count -= count;           /* Accumulate count for this sval */
     }
-    if (sval_last != 0)              /* Insert the last sval */
-      vcarray_insert(&a2, sval_last, sval_count);
+    if (sval_last != 0) {            /* Insert the last sval */
+      if (sval_last <= lastprime && sval_last < primes[a-1]*primes[a-1])
+        sum += sval_count*(bs_prime_count(sval_last,primes,lastidx)-a+2);
+      else
+        vcarray_insert(&a2, sval_last, sval_count);
+    }
     /* For each small sval, add up the counts */
     for ( ; i < a1.n; i++)
       sum -= arr[i].c;
     /* Merge a1 and a2 into a1.  a2 will be emptied. */
     vcarray_merge(&a1, &a2);
+    /* If we've grown too large, use recursive phi to clip. */
+    if ( a1.n > NTHRESH ) {
+      arr = a1.a;
+      if (verbose > 0) printf("clipping small values at a=%lu a1.n=%lu \n", a, a1.n);
+#ifdef _OPENMP
+      #pragma omp parallel for reduction(+: sum) firstprivate(pcache) schedule(dynamic, 16)
+#endif
+      for (i = 0; i < a1.n-NTHRESH+NTHRESH/50; i++) {
+        UV j = a1.n - 1 - i;
+        IV count = arr[j].c;
+        if (count != 0) {
+          sum += count * phi_small( arr[j].v, a-1, primes, lastidx, &pcache );
+          arr[j].c = 0;
+        }
+      }
+    }
     vcarray_remove_zeros(&a1);
     a--;
   }
+  phicache_free(&pcache);
   vcarray_destroy(&a2);
   arr = a1.a;
-  if (a > 7) {
-    if (verbose > 0) printf("CUT TO SMALL PHI AT a = %lu\n", a);
-#ifdef _OPENMP
-    #pragma omp parallel for reduction(+: sum) num_threads(2) schedule(dynamic, 16)
-#endif
-    for (i = 0; i < a1.n; i++)
-      sum += arr[i].c * phi_small( arr[i].v, a, primes );
-  } else {
 #ifdef _OPENMP
-    #pragma omp parallel for reduction(+: sum) schedule(dynamic, 16)
+  #pragma omp parallel for reduction(+: sum) schedule(dynamic, 16)
 #endif
-    for (i = 0; i < a1.n; i++)
-      sum += arr[i].c * mapes7( arr[i].v );
-  }
+  for (i = 0; i < a1.n; i++)
+    sum += arr[i].c * mapes7( arr[i].v );
   vcarray_destroy(&a1);
   Safefree(primes);
   return (UV) sum;
@@ -668,15 +699,11 @@ static UV phi(UV x, UV a)
 
 
 extern UV _XS_meissel_pi(UV n);
-static UV Pk_2(UV n, UV a)
+/* b = prime_count(isqrt(n)) */
+static UV Pk_2_p(UV n, UV a, UV b, const UV* primes, UV lastprime)
 {
-  UV b, lastprime, lastpc, lastw, lastwpc, i, P2;
-  const UV* primes; /* small prime cache, first b=pi(z)=pi(sqrt(n)) */
-
-  b = _XS_meissel_pi(isqrt(n));        /* b = pi(floor(n^1/2)) */
-  lastprime = b*SIEVE_MULT+1;
-  primes = generate_small_primes(lastprime);
-  lastpc = primes[lastprime];
+  UV lastw, lastwpc, i, P2;
+  UV lastpc = primes[lastprime];
 
   /* Ensure we have a large enough base sieve */
   prime_precalc(isqrt(n / primes[a+1]));
@@ -690,6 +717,13 @@ static UV Pk_2(UV n, UV a)
     P2 += lastwpc;
   }
   P2 -= ((b+a-2) * (b-a+1) / 2) - a + 1;
+  return P2;
+}
+static UV Pk_2(UV n, UV a, UV b)
+{
+  UV lastprime = b*SIEVE_MULT+1;
+  const UV* primes = generate_small_primes(lastprime);
+  UV P2 = Pk_2_p(n, a, b, primes, lastprime);
   Safefree(primes);
   return P2;
 }
@@ -705,7 +739,6 @@ UV _XS_legendre_pi(UV n)
 
   a = _XS_legendre_pi(isqrt(n));
   phina = phi(n, a);
-  phicache_free();
   return phina + a - 1;
 }
 
@@ -713,14 +746,14 @@ UV _XS_legendre_pi(UV n)
 /* Meissel's method. */
 UV _XS_meissel_pi(UV n)
 {
-  UV a, sum;
+  UV a, b, sum;
   if (n < SIEVE_LIMIT)
     return _XS_prime_count(2, n);
 
   a = _XS_meissel_pi(icbrt(n));        /* a = floor(n^1/3) */
+  b = _XS_meissel_pi(isqrt(n));        /* b = floor(n^1/2) */
 
-  sum = phi(n, a) + a - 1 - Pk_2(n, a);
-  phicache_free();
+  sum = phi(n, a) + a - 1 - Pk_2(n, a, b);
   return sum;
 }
 
@@ -754,8 +787,6 @@ UV _XS_lehmer_pi(UV n)
   if (verbose > 0) printf("lehmer %lu stage 2: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
   TIMING_START;
   sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
-  if (verbose > 1) printf("phicache size: %lu\n", (unsigned long)phicache_size());
-  phicache_free();
   TIMING_END_PRINT("phi(x,a)")
 
   /* We get an array of the first b primes.  This is used in stage 4.  If we
@@ -802,16 +833,20 @@ UV _XS_lehmer_pi(UV n)
 }
 
 
-/* The Lagarias-Miller-Odlyzko method.  Very simple implementation without
- * many optimizations.  It is very nice for memory use, but it isn't
- * particularly fast. */
+/* The Lagarias-Miller-Odlyzko method.
+ * Naive implementation without optimizations.
+ * About the same speed as Lehmer, a bit less memory.
+ * A better implementation can be 10-50x faster and much less memory.
+ */
 UV _XS_LMO_pi(UV n)
 {
   UV n12, n13, a, b, sum, i, j, k, lastprime, P2, S1, S2;
   const UV* primes = 0;  /* small prime cache */
   signed char* mu = 0;   /* moebius to n^1/3 */
   UV*   lpf = 0;         /* least prime factor to n^1/3 */
+  cache_t pcache; /* Cache for recursive phi */
   DECLARE_TIMING_VARIABLES;
+
   if (n < SIEVE_LIMIT)
     return _XS_prime_count(2, n);
 
@@ -831,13 +866,12 @@ UV _XS_LMO_pi(UV n)
   mu[0] = 0;
   for (i = 1; i <= n13; i++) {
     UV primei = primes[i];
-    UV j, isquared;
     for (j = primei; j <= n13; j += primei) {
       mu[j] = -mu[j];
       if (lpf[j] == 0) lpf[j] = primei;
     }
-    isquared = primei * primei;
-    for (j = isquared; j <= n13; j += isquared)
+    k = primei * primei;
+    for (j = k; j <= n13; j += k)
       mu[j] = 0;
   }
   lpf[1] = UV_MAX;  /* Set lpf[1] to max */
@@ -846,26 +880,40 @@ UV _XS_LMO_pi(UV n)
   k = (a < 7) ? a : 7;
   S1 = 0;
   S2 = 0;
+  phicache_init(&pcache);
+  TIMING_START;
   for (i = 1; i <= n13; i++)
     if (lpf[i] > primes[k])
-      S1 += mu[i] * phi_small(n/i, k, primes);
+      /* S1 += mu[i] * phi_small(n/i, k, primes, lastprime, &pcache); */
+      S1 += mu[i] * phi(n/i, k);
+  TIMING_END_PRINT("S1")
+
+  TIMING_START;
   for (i = k; i+1 < a; i++)
 #ifdef _OPENMP
-    #pragma omp parallel for reduction(+: S2) schedule(dynamic, 16) num_threads(2)
+    #pragma omp parallel for reduction(+: S2) firstprivate(pcache) schedule(dynamic, 16) num_threads(2)
 #endif
     for (j = (n13/primes[i+1])+1; j <= n13; j++)
       if (lpf[j] > primes[i+1])
-        S2 += -mu[j] * phi_small(n / (j*primes[i+1]), i, primes);
-  phicache_free();
+        S2 += -mu[j] * phi_small(n / (j*primes[i+1]), i, primes, lastprime, &pcache);
+  TIMING_END_PRINT("S2")
+  phicache_free(&pcache);
+  Safefree(lpf);
+  Safefree(mu);
 
-  P2 = Pk_2(n, a);
+  TIMING_START;
+  prime_precalc( (UV) pow(n, 2.9/5.0) );
+  P2 = Pk_2_p(n, a, b, primes, lastprime);
+  TIMING_END_PRINT("P2")
+  Safefree(primes);
 
   /* printf("S1 = %lu\nS2 = %lu\na  = %lu\nP2 = %lu\n", S1, S2, a, P2); */
   sum = (S1 + S2) + a - 1 - P2;
-  Safefree(primes);
   return sum;
 }
 
+UV _XS_legendre_phi(UV x, UV a) { return phi(x,a); }
+
 
 #ifdef PRIMESIEVE_STANDALONE
 int main(int argc, char *argv[])
diff --git a/lehmer.h b/lehmer.h
index 94b46a4..e2c3325 100644
--- a/lehmer.h
+++ b/lehmer.h
@@ -7,5 +7,6 @@ extern UV _XS_legendre_pi(UV n);
 extern UV _XS_meissel_pi(UV n);
 extern UV _XS_lehmer_pi(UV n);
 extern UV _XS_LMO_pi(UV n);
+extern UV _XS_legendre_phi(UV x, UV a);
 
 #endif
diff --git a/lib/Math/Prime/Util.pm b/lib/Math/Prime/Util.pm
index 38cd325..c553cef 100644
--- a/lib/Math/Prime/Util.pm
+++ b/lib/Math/Prime/Util.pm
@@ -1868,13 +1868,15 @@ sub prime_count {
       #                 + 0.0000000057 * $low**0.72;
       #if ($est_lehmer < $est_segment) {
       if ( ($high / ($high-$low+1)) < 100 ) {
-        $low = 2 if $low < 2;
-        return _XS_lehmer_pi($high) - _XS_lehmer_pi($low-1);
+        my $count;
+        $count  = _XS_LMO_pi($high);
+        $count -= _XS_LMO_pi($low-1) if $low > 2;
+        return $count;
       }
     }
     return _XS_prime_count($low,$high);
   }
-  # We can relax these constraints if MPU::GMP gets a Lehmer implementation.
+  # We can relax these constraints if MPU::GMP gets a fast implementation.
   return Math::Prime::Util::GMP::prime_count($low,$high) if $_HAVE_GMP
                        && defined &Math::Prime::Util::GMP::prime_count
                        && (   (ref($high) eq 'Math::BigInt')
@@ -2976,7 +2978,7 @@ count of primes between the ranges (e.g. C<(13,17)> returns 2, C<14,17>
 and C<13,16> return 1, and C<14,16> returns 0).
 
 The current implementation decides based on the ranges whether to use a
-segmented sieve with a fast bit count, or Lehmer's algorithm.  The former
+segmented sieve with a fast bit count, or the LMO algorithm.  The former
 is preferred for small sizes as well as small ranges.  The latter is much
 faster for large ranges.
 
@@ -2986,7 +2988,7 @@ where the first term is typically negligible below C<~ 10^11>.  Memory use
 is proportional only to C<sqrt(a)>, with total memory use under 1MB for any
 base under C<10^14>.
 
-Lehmer's method has complexity approximately C<O(b^0.7) + O(a^0.7)>.  It
+The LMO method has complexity approximately C<O(b^0.7) + O(a^0.7)>.  It
 does use more memory however.  A calculation of C<Pi(10^14)> completes in
 under 1 minute, C<Pi(10^15)> in under 5 minutes, and C<Pi(10^16)> in under
 20 minutes, however using about 500MB of peak memory for the last.
@@ -3057,9 +3059,9 @@ another way, this returns the smallest C<p> such that C<Pi(p) E<gt>= n>.
 For relatively small inputs (below 2 million or so), this does a sieve over
 a range containing the nth prime, then counts up to the number.  This is fairly
 efficient in time and memory.  For larger values, a binary search is performed
-between the Dusart 2010 bounds using Riemann's R function, then Lehmer's fast
+between the Dusart 2010 bounds using Riemann's R function, then a fast
 prime counting method is used to calculate the count up to that point, then
-sieving is done in the typically small error zone.
+sieving is done in the typically small difference zone.
 
 While this method is hundreds of times faster than generating primes, and
 doesn't involve big tables of precomputed values, it still can take a fair
@@ -4414,7 +4416,7 @@ Project Euler, problem 187, stupid brute force solution, ~4 minutes:
 
 Here is the best way for PE187.  Under 30 milliseconds from the command line:
 
-  use Math::Prime::Util qw/primes prime_count -nobigint/;
+  use Math::Prime::Util qw/primes prime_count/;
   use List::Util qw/sum/;
   my $limit = shift || int(10**8);
   my @primes = @{primes(int(sqrt($limit)))};
diff --git a/t/13-primecount.t b/t/13-primecount.t
index 160da98..e2c0975 100644
--- a/t/13-primecount.t
+++ b/t/13-primecount.t
@@ -87,7 +87,7 @@ plan tests => 0 + 1
                 + $use64 * 3 * scalar(keys %pivals64)
                 + scalar(keys %intervals)
                 + 1
-                + 6; # prime count specific methods
+                + 7; # prime count specific methods
 
 ok( eval { prime_count(13); 1; }, "prime_count in void context");
 
@@ -156,11 +156,12 @@ sub parse_range {
 
 # Make sure each specific algorithm isn't broken.
 SKIP: {
-  skip "Not XS -- skipping direct primecount tests", 4 unless $isxs;
+  skip "Not XS -- skipping direct primecount tests", 5 unless $isxs;
   # This has to be above lehmer.c's SIEVE_LIMIT or nothing happens.
   is(Math::Prime::Util::_XS_lehmer_pi  (66123456), 3903023, "XS Lehmer count");
   is(Math::Prime::Util::_XS_meissel_pi (66123456), 3903023, "XS Meissel count");
   is(Math::Prime::Util::_XS_legendre_pi(66123456), 3903023, "XS Legendre count");
+  is(Math::Prime::Util::_XS_LMO_pi     (66123456), 3903023, "XS LMO count");
   is(Math::Prime::Util::_XS_prime_count(66123456), 3903023, "XS sieve count");
 }
 is(Math::Prime::Util::PP::_lehmer_pi   (3456789), 247352, "PP Lehmer count");
diff --git a/t/14-nthprime.t b/t/14-nthprime.t
index 217920b..acfe551 100644
--- a/t/14-nthprime.t
+++ b/t/14-nthprime.t
@@ -6,6 +6,7 @@ use Test::More;
 use Math::Prime::Util qw/primes nth_prime nth_prime_lower nth_prime_upper nth_prime_approx/;
 
 my $use64 = Math::Prime::Util::prime_get_config->{'maxbits'} > 32;
+my $usexs = Math::Prime::Util::prime_get_config->{'xs'};
 my $extra = defined $ENV{EXTENDED_TESTING} && $ENV{EXTENDED_TESTING};
 my $broken64 = (18446744073709550592 == ~0);
 
@@ -59,7 +60,7 @@ plan tests => 0 + 2*scalar(keys %pivals32)
                 + $use64 * 3 * scalar(keys %nthprimes64)
                 + 4
                 + 3
-                + (($extra && $use64) ? 1 : 0);
+                + (($extra && $use64 && $usexs) ? 1 : 0);
 
 
 while (my($n, $pin) = each (%pivals32)) {
@@ -115,7 +116,7 @@ like($@, qr/overflow/, "nth_prime_approx($overindex) overflows");
 eval { nth_prime($overindex); };
 like($@, qr/overflow/, "nth_prime($overindex) overflows");
 
-if ($extra && $use64) {
+if ($extra && $use64 && $usexs) {
   # Test an nth prime value that uses the binary-search-on-R(n) algorithm
   is( nth_prime(21234567890), 551990503367, "nth_prime(21234567890)" );
 }
diff --git a/t/19-moebius.t b/t/19-moebius.t
index dbc75ef..19011c5 100644
--- a/t/19-moebius.t
+++ b/t/19-moebius.t
@@ -9,6 +9,7 @@ use Math::Prime::Util
 
 my $extra = defined $ENV{EXTENDED_TESTING} && $ENV{EXTENDED_TESTING};
 my $use64 = Math::Prime::Util::prime_get_config->{'maxbits'} > 32;
+my $usexs = Math::Prime::Util::prime_get_config->{'xs'};
 my $broken64 = (18446744073709550592 == ~0);
 $use64 = 0 if $broken64;
 
@@ -182,6 +183,13 @@ my @mult_orders = (
   [31407,2147475467,266],
 );
 
+# These are slow with XS, and *really* slow with PP.
+if (!$usexs) {
+  %big_mertens = map { $_ => $big_mertens{$_} }
+                 grep { $_ < 100000000 }
+                 keys %big_mertens;
+}
+
 plan tests => 0 + 1
                 + 1 # Small Moebius
                 + 3*scalar(keys %mertens)
diff --git a/t/31-threading.t b/t/31-threading.t
index 7c1315a..ba5104e 100644
--- a/t/31-threading.t
+++ b/t/31-threading.t
@@ -5,7 +5,7 @@ use warnings;
 use Config;
 BEGIN {
   if (! $Config{useithreads} || $] < 5.008) {
-    print("1..0 # Skip Threads not supported\n");
+    print("1..0 # Skip perl isn't compiled with threading support\n");
     exit(0);
   }
   # Should be be looking for newer than 5.008?
diff --git a/util.c b/util.c
index 7eaaad7..c642b26 100644
--- a/util.c
+++ b/util.c
@@ -666,7 +666,7 @@ UV _XS_nth_prime(UV n)
       /* Calculate lower limit, get count, sieve to that */
       segment_size = lower_limit / 30;
       lower_limit = 30 * segment_size - 1;
-      count = _XS_lehmer_pi(lower_limit) - 3;
+      count = _XS_LMO_pi(lower_limit) - 3;
       MPUassert(count <= target, "Pi(nth_prime_lower(n))) > n");
     } else {
       /* Compute approximate nth prime via binary search on R(n) */
@@ -684,7 +684,7 @@ UV _XS_nth_prime(UV n)
       lower_limit = (UV) (double)(0.9999999*(lo-1));
       segment_size = lower_limit / 30;
       lower_limit = 30 * segment_size - 1;
-      count = _XS_lehmer_pi(lower_limit);
+      count = _XS_LMO_pi(lower_limit);
       /*
         printf("We've estimated %lu too %s.\n", (count>n)?count-n:n-count, (count>n)?"FAR":"little");
         printf("Our limit %lu %s a prime\n", lower_limit, _XS_is_prime(lower_limit) ? "is" : "is not");

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git