[libmath-prime-util-perl] 09/25: Extended LMO algorithm

Thu May 21 18:50:38 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.35
in repository libmath-prime-util-perl.

commit 69f65edb812c72e9663cd2bcd67a26774b276998
Author: Dana Jacobsen <dana at acm.org>
Date:   Thu Dec 5 08:48:23 2013 -0800

    Extended LMO algorithm
---
 .gitignore             |   1 +
 Changes                |   5 +
 Makefile.PL            |   1 +
 XS.xs                  |   3 +
 lehmer.c               |  36 ++-
 lehmer.h               |   3 +-
 lib/Math/Prime/Util.pm |   6 +-
 lmo.c                  | 628 +++++++++++++++++++++++++++++++++++++++++++++++++
 lmo.h                  |   8 +
 util.c                 |   2 +-
 10 files changed, 682 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 55be186..79f9d4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ factor.o
 sieve.o
 util.o
 lehmer.o
+lmo.o
 primality.o
 aks.o
 blib*
diff --git a/Changes b/Changes
index 64c32ff..0a5732e 100644
--- a/Changes
+++ b/Changes
@@ -18,10 +18,15 @@ Revision history for Perl module Math::Prime::Util
 
     [FUNCTIONALITY AND PERFORMANCE]
 
+    - Switched to extended LMO algorithm for prime_count.  Much better memory
+      use and much faster for large values.  Speeds up nth_prime also.
+
     - Some fixes for 32-bit.
 
     - prime_count_approx, upper, and lower return exact answers in more cases.
 
+    - Fixed a problem with Lehmer prime_count introduced in 0.34.
+
 
 0.34  2013-11-19
 
diff --git a/Makefile.PL b/Makefile.PL
index 02ce49a..3a9d727 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -27,6 +27,7 @@ WriteMakefile1(
                     'primality.o '.
                     'aks.o '      .
                     'lehmer.o '   .
+                    'lmo.o '      .
                     'sieve.o '    .
                     'util.o '     .
                     'XS.o',
diff --git a/XS.xs b/XS.xs
index de47b4e..4832367 100644
--- a/XS.xs
+++ b/XS.xs
@@ -17,6 +17,7 @@
 #include "primality.h"
 #include "factor.h"
 #include "lehmer.h"
+#include "lmo.h"
 #include "aks.h"
 
 #if BITS_PER_WORD == 64 && defined(_MSC_VER)
@@ -202,6 +203,7 @@ _XS_nth_prime(IN UV n)
     _XS_meissel_pi = 4
     _XS_lehmer_pi = 5
     _XS_LMO_pi = 6
+    _XS_LMOS_pi = 7
   PREINIT:
     UV ret;
   CODE:
@@ -213,6 +215,7 @@ _XS_nth_prime(IN UV n)
       case 4: ret = _XS_meissel_pi(n); break;
       case 5: ret = _XS_lehmer_pi(n); break;
       case 6: ret = _XS_LMO_pi(n); break;
+      case 7: ret = _XS_LMOS_pi(n); break;
       default: croak("_XS_nth_prime: Unknown function alias"); break;
     }
     RETVAL = ret;
diff --git a/lehmer.c b/lehmer.c
index 29ad8d7..368d194 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -723,8 +723,7 @@ static UV Pk_2_p(UV n, UV a, UV b, const uint32_t* primes, uint32_t lastidx)
 }
 static UV Pk_2(UV n, UV a, UV b)
 {
-  UV lastprime = b*SIEVE_MULT+1;
-  if (lastprime > 203280221) lastprime = 203280221;
+  UV lastprime = ((b*SIEVE_MULT+1) > 203280221) ? 203280221 : b*SIEVE_MULT+1;
   const uint32_t* primes = generate_small_primes(lastprime);
   UV P2 = Pk_2_p(n, a, b, primes, lastprime);
   Safefree(primes);
@@ -842,7 +841,7 @@ UV _XS_lehmer_pi(UV n)
  * About the same speed as Lehmer, a bit less memory.
  * A better implementation can be 10-50x faster and much less memory.
  */
-UV _XS_LMO_pi(UV n)
+UV _XS_LMOS_pi(UV n)
 {
   UV n13, a, b, sum, i, j, k, lastprime, P2, S1, S2;
   const uint32_t* primes = 0;  /* small prime cache */
@@ -921,7 +920,34 @@ UV _XS_LMO_pi(UV n)
   return sum;
 }
 
-UV _XS_legendre_phi(UV x, UV a) { return phi(x,a); }
+static const unsigned char primes_small[] =
+  {0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
+   101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191};
+#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0]))
+
+UV _XS_legendre_phi(UV x, UV a) {
+  /* For small values, calculate directly */
+  if (a < 3)  return (a == 0) ? x : (a == 1) ? x-x/2 : x-x/2-x/3+x/6;
+  if (a <= 7) return mapes(x, a);
+  /* For large values, do our non-recursive phi */
+  if (a > NPRIMES_SMALL) return phi(x,a);
+  /* Otherwise, recurse */
+  {
+    UV i;
+    UV sum = mapes7(x);
+    for (i = 8; i <= a; i++) {
+      uint32_t p = primes_small[i];
+      UV xp = x/p;
+      if (xp < p) {
+        while (x < primes_small[a])
+          a--;
+        return (sum - a + i - 1);
+      }
+      sum -= _XS_legendre_phi(xp, i-1);
+    }
+    return sum;
+  }
+}
 
 
 #ifdef PRIMESIEVE_STANDALONE
@@ -947,7 +973,7 @@ int main(int argc, char *argv[])
   if      (!strcasecmp(method, "lehmer"))   { pi = _XS_lehmer_pi(n);      }
   else if (!strcasecmp(method, "meissel"))  { pi = _XS_meissel_pi(n);     }
   else if (!strcasecmp(method, "legendre")) { pi = _XS_legendre_pi(n);    }
-  else if (!strcasecmp(method, "lmo"))      { pi = _XS_LMO_pi(n);         }
+  else if (!strcasecmp(method, "lmo"))      { pi = _XS_LMOS_pi(n);  }
   else if (!strcasecmp(method, "sieve"))    { pi = _XS_prime_count(2, n); }
   else {
     printf("method must be one of: lehmer, meissel, legendre, lmo, or sieve\n");
diff --git a/lehmer.h b/lehmer.h
index e2c3325..29b4f53 100644
--- a/lehmer.h
+++ b/lehmer.h
@@ -6,7 +6,8 @@
 extern UV _XS_legendre_pi(UV n);
 extern UV _XS_meissel_pi(UV n);
 extern UV _XS_lehmer_pi(UV n);
-extern UV _XS_LMO_pi(UV n);
+extern UV _XS_LMOS_pi(UV n);
+
 extern UV _XS_legendre_phi(UV x, UV a);
 
 #endif
diff --git a/lib/Math/Prime/Util.pm b/lib/Math/Prime/Util.pm
index faf70fd..eba5a3b 100644
--- a/lib/Math/Prime/Util.pm
+++ b/lib/Math/Prime/Util.pm
@@ -1791,10 +1791,8 @@ sub prime_count {
       #if ($est_lehmer < $est_segment) {
       if ( ($high / ($high-$low+1)) < 100 ) {
         my $count;
-        $count  =   ($high > 8_000_000_000) ? _XS_LMO_pi($high)  : _XS_lehmer_pi($high);
-        if ($low > 2) {
-          $count -= ($low  > 8_000_000_000) ? _XS_LMO_pi($low-1) : _XS_lehmer_pi($low-1);
-        }
+        $count  =  _XS_LMO_pi($high);
+        $count -=  _XS_LMO_pi($low-1) if $low > 2;
         return $count;
       }
     }
diff --git a/lmo.c b/lmo.c
new file mode 100644
index 0000000..eb37c8d
--- /dev/null
+++ b/lmo.c
@@ -0,0 +1,628 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+/*****************************************************************************
+ *
+ * Prime counts using the extended Lagarias-Miller-Odlyzko combinatorial method.
+ *
+ * Copyright (c) 2013 Dana Jacobsen (dana at acm.org)
+ * This is free software; you can redistribute it and/or modify it under
+ * the same terms as the Perl 5 programming language system itself.
+ *
+ * This file is part of the Math::Prime::Util Perl module, but it should
+ * not be difficult to turn it into standalone code.
+ *
+ * The main references I used were:
+ *  - Christian Bau's paper and example implementation, 2003, Christian Bau
+ *    This was of immense help.  References to "step #" refer to this preprint.
+ *  - "Computing Pi(x): the combinatorial method", 2006, Tomás Oliveira e Silva
+ *  - "Computing Pi(x): The Meissel, Lehmer, Lagarias, Miller, Odlyzko Method"
+ *    1996, Deléglise and Rivat.
+ *
+ * Comparisons to the other prime counting implementations here:
+ *
+ * Meissel: Combinatorial phi.  Simple implementation.
+ * Lehmer:  Combinatorial phi.  Memory use grows rapidly.
+ * LMOS:    Combinatorial phi.  Very basic LMO implementation.
+ * LMO:     Sieve phi.  10-50x faster than LMOS, better growth rate,
+ *          Much, much better memory use than the others.
+ *
+ * Performance is slightly worse than Christian Bau's implementation, but
+ * in theory this implementation has more parallelization opportunities.
+ * Timing below is single core using MPU.
+ *
+ *  |   n   |  Meissel |  Lehmer  |   LMOS   |    LMO    |
+ *  +-------+----------+----------+----------+-----------+
+ *  | 10^19 |          |          |          | 3707.43   |
+ *  | 10^18 |          |          |          |  778.70   |
+ *  | 10^17 |          |          |          |  172.12   |
+ *  | 10^16 | 1410.2   | 1043.6   | 1058.9   |   37.351  |
+ *  | 10^15 |  137.1   |  137.3   |  142.7   |    8.072  |
+ *  | 10^14 |   26.18  |   21.74  |   21.29  |    1.906  |
+ *  | 10^13 |    5.155 |    3.947 |    3.353 |    0.455  |
+ *  | 10^12 |    1.059 |    0.700 |    0.626 |    0.111  |
+ *  | 10^11 |    0.227 |    0.138 |    0.124 |    0.0269 |
+ *  | 10^10 |    0.0509|    0.0309|    0.0286|    0.00702|
+ */
+
+/* Below this size, just sieve (with table speedup). */
+#define SIEVE_LIMIT  60000000
+/* Adjust to get best performance */
+#define M_FACTOR(n)     (UV) ((double)n * (log(n)/log(3)))
+/* Size of segment used for previous primes, must be >= 21 */
+#define PREV_SIEVE_SIZE 512
+/* Phi sieve multiplier, adjust for best performance */
+#define PHI_SIEVE_MULT 13
+
+#include "lmo.h"
+#include "util.h"
+#include "cache.h"
+#include "sieve.h"
+
+typedef unsigned char  uint8;
+typedef unsigned short uint16;
+typedef uint32_t       uint32;
+
+static UV icbrt(UV n) {
+  UV b, root = 0;
+  int s = 63;
+  if (n >= UVCONST(18446724184312856125)) return UVCONST(2642245);
+  for (; s >= 0; s -= 3) {
+    root += root;
+    b = 3*root*(root+1)+1;
+    if ((n >> s) >= b) {
+      n -= b << s;
+      root++;
+    }
+  }
+  return root;
+}
+
+static const unsigned char byte_ones[256] =
+  {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+   1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+   1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+   2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+   1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+   2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+   2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+   3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};
+
+static uint32_t bitcount(uint32_t b) {
+  /* The simple table method is faster than __builtin_popcount or
+   * using a 11-bit word table.  It is slower than the Nahalem asm. */
+#if 1
+  return byte_ones[b&0xFF] + byte_ones[(b>>8)&0xFF] + byte_ones[(b>>16)&0xFF] + byte_ones[b>>24];
+#else
+  uint32 ret;
+  __asm__("popcnt %1, %0" : "=r" (ret) : "r" (b));
+  return ret;
+#endif
+}
+
+/* static uint32 count_one_bits(const unsigned char* m, uint32 nbytes) {
+  uint32 count = 0;
+  while (nbytes--)  count += byte_ones[*m++];
+  return count;
+} */
+
+/* Create array of small primes: 0,2,3,5,...,prev_prime(n+1) */
+static uint32_t* make_primelist(uint32 n, uint32* number_of_primes)
+{
+  uint32 i = 0;
+  uint32_t* plist;
+  double logn = log(n);
+  uint32 max_index = (n < 67)     ? 18
+                   : (n < 355991) ? 15+(n/(logn-1.09))
+                   : (n/log(n)) * (1.0+1.0/logn+2.51/(logn*logn));
+  *number_of_primes = 0;
+  New(0, plist, max_index+1, uint32_t);
+  if (plist == 0)
+    croak("Can not allocate small primes\n");
+  plist[0] = 0;
+  START_DO_FOR_EACH_PRIME(2, n) {
+    plist[++i] = p;
+  } END_DO_FOR_EACH_PRIME;
+  *number_of_primes = i;
+  return plist;
+}
+
+/* Given a max prime in small prime list, return max prev prime input */
+static uint32 prev_sieve_max(UV maxprime) {
+  UV limit = maxprime*maxprime - (maxprime*maxprime % (16*PREV_SIEVE_SIZE)) - 1;
+  return (limit > U32_CONST(4294967295)) ? U32_CONST(4294967295) : limit;
+}
+
+/* Simple SoE filling a segment */
+static void _prev_sieve_fill(uint32 start, uint8* sieve, const uint32_t* primes) {
+  uint32 i, j, p;
+  memset( sieve, 0xFF, PREV_SIEVE_SIZE );
+  for (i = 2, p = 3; p*p < start + (16*PREV_SIEVE_SIZE); p = primes[++i])
+    for (j = (start == 0) ? p*p/2 : (p-1) - ((start+(p-1))/2) % p;
+         j < (8*PREV_SIEVE_SIZE); j += p)
+      sieve[j/8] &= ~(1U << (j%8));
+}
+
+/* Calculate previous prime using small segment */
+static uint32 prev_sieve_prime(uint32 n, uint8* sieve, uint32* segment_start, uint32 sieve_max, const uint32_t* primes)
+{
+  uint32 sieve_start, bit_offset;
+  if (n <= 3) return (n == 3) ? 2 : 0;
+  if (n > sieve_max) croak("ps overflow\n");
+
+  /* If n > 3 && n <= sieve_max, then there is an odd prime we can find. */
+  n -= 2;
+  bit_offset = n % (16*PREV_SIEVE_SIZE);
+  sieve_start = n - bit_offset;
+  bit_offset >>= 1;
+
+  while (1) {
+    if (sieve_start != *segment_start) {  /* Fill sieve if necessary */
+      _prev_sieve_fill(sieve_start, sieve, primes);
+      *segment_start = sieve_start;
+    }
+    do {                                  /* Look for a set bit in sieve */
+      if (sieve[bit_offset / 8] & (1u << (bit_offset % 8)))
+        return sieve_start + 2*bit_offset + 1;
+    } while (bit_offset-- > 0);
+    sieve_start -= (16 * PREV_SIEVE_SIZE);
+    bit_offset = ((16 * PREV_SIEVE_SIZE) - 1) / 2;
+  }
+}
+
+/* Create factor table.
+ * In lehmer.c we create mu and lpf arrays, which I suspect is faster.
+ * Here we use Christian Bau's method, which is more memory efficient.
+ * TODO: Neither the speed nor space really matter, but we should measure them.
+ *
+ * In a uint16 we have stored:
+ *    0     moebius(n) = 0
+ *    even  moebius(n) = 1
+ *    odd   moebius(n) = -1   (last bit indicates even/odd number of factors)
+ *    v     smallest odd prime factor of n is v&1
+ *    65535 large prime
+ */
+static uint16* ft_create(uint32 max)
+{
+  uint16* factor_table;
+  uint32 i;
+  uint32 tableLimit = max + 338 + 1;  /* At least one more prime */
+  uint32 tableSize = tableLimit/2;
+  uint32 max_prime = (tableLimit - 1) / 3 + 1;
+
+  New(0, factor_table, tableSize, uint16);
+  if (factor_table == 0)  return 0;
+
+  /* Set all values to 65535 (a large prime), set 0 to 65534. */
+  factor_table[0] = 65534;
+  for (i = 1; i < tableSize; ++i)
+    factor_table[i] = 65535;
+  
+  /* Process each odd. */
+  for (i = 1; i < tableSize; ++i) {
+    uint32 factor, max_factor;
+    uint32 p = i*2+1;
+    if (factor_table[i] != 65535)  /* Already marked. */
+      continue;
+    if (p < 65535)                 /* p is a small prime, so set the number. */
+      factor_table[i] = p;
+    if (p >= max_prime)            /* No multiples will be in the table */
+      continue;
+
+    max_factor = (tableLimit - 1) / p + 1;
+    /* Look for odd multiples of the prime p. */
+    for (factor = 3; factor < max_factor; factor += 2) {
+      uint32 index = (p*factor)/2;
+      if (factor_table[index] == 65535)  /* p is smallest factor */
+        factor_table[index] = p;
+      else if (factor_table[index] > 0)  /* Change number of factors */
+        factor_table[index] ^= 0x01;
+    }
+
+    /* Change all odd multiples of p*p to 0 to indicate non-square-free. */
+    for (factor = p; factor < max_factor; factor += 2*p)
+      factor_table[ (p*factor) / 2] = 0;
+  }
+  return factor_table;
+}
+
+
+static UV mapes(UV x, uint32 a)
+{
+  UV val;
+  if (a == 0)  return x;
+  if (a == 1)  return x-x/2;
+  val = x-x/2-x/3+x/6;
+  if (a >= 3) val += 0-x/5+x/10+x/15-x/30;
+  if (a >= 4) val += 0-x/7+x/14+x/21-x/42+x/35-x/70-x/105+x/210;
+  if (a >= 5) val += 0-x/11+x/22+x/33-x/66+x/55-x/110-x/165+x/330+x/77-x/154-x/231+x/462-x/385+x/770+x/1155-x/2310;
+  if (a >= 6) val += 0-x/13+x/26+x/39-x/78+x/65-x/130-x/195+x/390+x/91-x/182-x/273+x/546-x/455+x/910+x/1365-x/2730+x/143-x/286-x/429+x/858-x/715+x/1430+x/2145-x/4290-x/1001+x/2002+x/3003-x/6006+x/5005-x/10010-x/15015+x/30030;
+  if (a >= 7) val += 0-x/17+x/34+x/51-x/102+x/85-x/170-x/255+x/510+x/119-x/238-x/357+x/714-x/595+x/1190+x/1785-x/3570+x/187-x/374-x/561+x/1122-x/935+x/1870+x/2805-x/5610-x/1309+x/2618+x/3927-x/7854+x/6545-x/13090-x/19635+x/39270+x/221-x/442-x/663+x/1326-x/1105+x/2210+x/3315-x/6630-x/1547+x/3094+x/4641-x/9282+x/7735-x/15470-x/23205+x/46410-x/2431+x/4862+x/7293-x/14586+x/12155-x/24310-x/36465+x/72930+x/17017-x/34034-x/51051+x/102102-x/85085+x/170170+x/255255-x/510510;
+  return (UV) val;
+}
+
+typedef struct {
+  uint32_t *sieve;                 /* segment bit mask */
+  uint8    *word_count;            /* bit count in each 64-bit word */
+  uint32   *word_count_sum;        /* cumulative sum of word_count */
+  UV       *totals;                /* total bit count for all phis at index */
+  uint32   *prime_index;           /* index of prime where phi(n/p/p(k+1))=1 */
+  uint32   *first_bit_index;       /* offset relative to start for this prime */
+  uint8    *multiplier;            /* mod-30 wheel of each prime */
+  UV        start;                 /* x value of first bit of segment */
+  uint32    size;                  /* segment size in bits */
+  uint32    first_prime;           /* index of first prime in segment */
+  uint32    last_prime;            /* index of last prime in segment */
+  uint32    last_prime_to_remove;  /* index of last prime p, p^2 in segment */
+} sieve_t;
+
+/* Size of phi sieve in 32-bit words.  Multiple of 2*2*3*5*7*11 bytes. */
+#define PHI_SIEVE_WORDS (1155 * PHI_SIEVE_MULT)
+
+/* Bit counting using cumulative sums.  A bit slower than using a running sum,
+ * but a little simpler and can be run in parallel. */
+static void make_sieve_counts(const uint32_t* sieve, uint32 sieve_size, uint8* sieve_word_count) {
+  uint32 lwords = (sieve_size + 127) / 128;
+  while (lwords--) {
+    *sieve_word_count++ = bitcount(sieve[0]) + bitcount(sieve[1]);
+    sieve += 2;
+  }
+}
+static uint32 make_sieve_sums(uint32 sieve_size, const uint8* sieve_word_count, uint32* sieve_word_count_sum) {
+  uint32 i, bc, lwords = (sieve_size + 127) / 128;
+  sieve_word_count_sum[0] = 0;
+  for (i = 0, bc = 0; i+7 < lwords; i += 8) {
+    const uint8* cntptr = sieve_word_count + i;
+    uint32* sumptr = sieve_word_count_sum + i;
+    sumptr[1] = bc += cntptr[0];
+    sumptr[2] = bc += cntptr[1];
+    sumptr[3] = bc += cntptr[2];
+    sumptr[4] = bc += cntptr[3];
+    sumptr[5] = bc += cntptr[4];
+    sumptr[6] = bc += cntptr[5];
+    sumptr[7] = bc += cntptr[6];
+    sumptr[8] = bc += cntptr[7];
+  }
+  for (; i < lwords; i++)
+    sieve_word_count_sum[i+1] = sieve_word_count_sum[i] + sieve_word_count[i];
+  return sieve_word_count_sum[lwords];
+}
+
+static UV _sieve_phi(UV segment_x, const uint32_t* sieve, const uint32* sieve_word_count_sum) {
+  uint32 bits = (segment_x + 1) / 2;
+  uint32 words = bits / 32;
+  uint32 sieve_sum = sieve_word_count_sum[words/2];
+  if (words & 1) sieve_sum += bitcount( sieve[words-1] );
+  sieve_sum += bitcount( sieve[words] & ~(0xfffffffful << (bits % 32)) );
+  return sieve_sum;
+}
+
+/* Erasing primes from the sieve is done using Christian Bau's
+ * case statement walker.  It's not pretty, but it is short, fast,
+ * clever, and does the job. */
+
+#define sieve_zero(sieve, si, wordcount) \
+  { uint32 index = si/32; \
+    uint32 mask = 1UL << (si % 32); \
+    if (sieve[index] & mask) { \
+      sieve[index] &= ~mask; \
+      wordcount[index/2]--; \
+    }  }
+
+#define sieve_case_zero(casenum, skip, si, p, size, mult, sieve, wordcount) \
+  case casenum: sieve_zero(sieve, si, wordcount); \
+                si += skip * p; \
+                mult = (casenum+1) % 8; \
+                if (si >= size) break;
+
+static UV remove_primes(uint32 index, uint32 last_index, sieve_t* s, const uint32_t* primes)
+{
+  uint32    size = (s->size + 1) / 2;
+  uint32_t *sieve = s->sieve;
+  uint8    *word_count = s->word_count;
+  UV        total = s->totals[last_index];
+
+  for ( ;index <= last_index; index++) {
+    if (index >= s->first_prime && index <= s->last_prime) {
+      uint32 b = (primes[index] - (uint32) s->start - 1) / 2;
+      sieve_zero(sieve, b, word_count);
+    }
+    if (index <= s->last_prime_to_remove) {
+      uint32 b = s->first_bit_index[index];
+      if (b < size) {
+        uint32 p    = primes[index];
+        uint32 mult = s->multiplier[index];
+        switch (mult) {
+          reloop: ;
+            sieve_case_zero(0, 3, b, p, size, mult, sieve, word_count);
+            sieve_case_zero(1, 2, b, p, size, mult, sieve, word_count);
+            sieve_case_zero(2, 1, b, p, size, mult, sieve, word_count);
+            sieve_case_zero(3, 2, b, p, size, mult, sieve, word_count);
+            sieve_case_zero(4, 1, b, p, size, mult, sieve, word_count);
+            sieve_case_zero(5, 2, b, p, size, mult, sieve, word_count);
+            sieve_case_zero(6, 3, b, p, size, mult, sieve, word_count);
+            sieve_case_zero(7, 1, b, p, size, mult, sieve, word_count);
+          goto reloop;
+        }
+        s->multiplier[index] = mult;
+      }
+      s->first_bit_index[index] = b - size;
+    }
+  }
+  s->totals[last_index] += make_sieve_sums(s->size, s->word_count, s->word_count_sum);
+  return total;
+}
+
+static void word_tile (uint32_t* source, uint32 from, uint32 to) {
+  while (from < to) {
+    uint32 words = (2*from > to) ? to-from : from;
+    memcpy(source+from, source, 4*words);
+    from += words;
+  }
+}
+
+static UV init_segment(sieve_t* s, UV segment_start, uint32 size, uint32 start_prime_index, uint32 sieve_last, const uint32_t* primes)
+{
+  uint32    i, words;
+  uint32_t* sieve = s->sieve;
+  uint8*    word_count = s->word_count;
+
+  s->start = segment_start;
+  s->size  = size;
+
+  if (segment_start == 0) {
+    s->last_prime = 0;
+    s->last_prime_to_remove = 0;
+  }
+  s->first_prime = s->last_prime + 1;
+  while (s->last_prime < sieve_last) {
+    uint32 p = primes[s->last_prime + 1];
+    if (p >= segment_start + size)
+      break;
+    s->last_prime++;
+  }
+  while (s->last_prime_to_remove < sieve_last) {
+    UV p = primes[s->last_prime_to_remove + 1];
+    UV p2 = p*p;
+    if (p2 >= segment_start + size)
+      break;
+    s->last_prime_to_remove++;
+    s->first_bit_index[s->last_prime_to_remove] = (p2 - segment_start - 1) / 2;
+    s->multiplier[s->last_prime_to_remove] = (p % 30) * 8 / 30;
+  }
+
+  memset(sieve, 0xFF, 3*4);        /* Set first 3 words to all 1 bits */
+  if (start_prime_index >= 3)      /* Remove multiples of 3. */
+    for (i = 3/2; i < 3 * 32; i += 3)
+      sieve[i / 32] &= ~(1ul << (i % 32));
+
+  word_tile(sieve, 3, 15);         /* Copy to first 15 = 3*5 words */
+  if (start_prime_index >= 3)      /* Remove multiples of 5. */
+    for (i = 5/2; i < 15 * 32; i += 5)
+      sieve[i / 32] &= ~(1ul << (i % 32));
+
+  word_tile(sieve, 15, 105);       /* Copy to first 105 = 3*5*7 words */
+  if (start_prime_index >= 4)      /* Remove multiples of 7. */
+    for (i = 7/2; i < 105 * 32; i += 7)
+      sieve[i / 32] &= ~(1ul << (i % 32));
+
+  word_tile(sieve, 105, 1155);     /* Copy to first 1155 = 3*5*7*11 words */
+  if (start_prime_index >= 5)      /* Remove multiples of 11. */
+    for (i = 11/2; i < 1155 * 32; i += 11)
+      sieve[i / 32] &= ~(1ul << (i % 32));
+
+  size = (size+1) / 2;             /* size to odds */
+  words = (size + 31) / 32;        /* sieve size in 32-bit words */
+  word_tile(sieve, 1155, words);   /* Copy first 1155 words to rest */
+  /* Zero all unused bits and words */
+  if (size % 32)
+    sieve[words-1] &= ~ (0xffffffffUL << (size % 32));
+  memset(sieve + words, 0x00, 4*(PHI_SIEVE_WORDS+2-words));
+
+  /* Now remove the rest of the primes and create counts+sums. */
+  make_sieve_counts(sieve, s->size, word_count);
+  for (i = 6; i <= start_prime_index; i++) {
+    if (i >= s->first_prime && i <= s->last_prime) {
+      uint32 b = (primes[i] - segment_start - 1) / 2;
+      sieve_zero(sieve, b, word_count);
+    }
+  }
+  return remove_primes(6, start_prime_index, s, primes);
+}
+
+/* However we want to handle reduced prime counts */
+#define simple_pi(n)  _XS_LMO_pi(n)
+/* Macros to hide all the variables being passed */
+#define prev_sieve_prime(n) \
+  prev_sieve_prime(n, &prev_sieve[0], &ps_start, ps_max, primes)
+#define sieve_phi(x) \
+  phi_total + _sieve_phi((x) - ss.start, ss.sieve, ss.word_count_sum)
+
+
+UV _XS_LMO_pi(UV n)
+{
+  UV        N2, N3, K2, K3, M, sum1, sum2, phi_total, phi_value;
+  UV        sieve_start, sieve_end, least_divisor, step7_max, last_phi_sieve;
+  uint32    j, k, piM, KM, end, prime, prime_index;
+  uint32    ps_start, ps_max, smallest_divisor, nprimes;
+  uint8     prev_sieve[PREV_SIEVE_SIZE];
+  uint32_t *primes;
+  uint16   *factor_table;
+  sieve_t   ss;
+
+  const uint32 c = 7;  /* We can use out Mapes function for c <= 7 */
+
+  /* For "small" n, use our table+segment sieve. */
+  if (n < SIEVE_LIMIT || n < 10000)  return _XS_prime_count(2, n);
+  /* n should now be reasonably sized (not tiny). */
+
+  N2 = isqrt(n);             /* floor(N^1/2) */
+  N3 = icbrt(n);             /* floor(N^1/3) */
+  K2 = simple_pi(N2);        /* Pi(N2) */
+  K3 = simple_pi(N3);        /* Pi(N3) */
+
+  /* M is N^1/3 times a tunable performance factor. */
+  M = M_FACTOR(N3);
+  if (M >= N2) M = N2 - 1;         /* M must be smaller than N^1/2 */
+  if (M < N3) M = N3;              /* M must be at least N^1/3 */
+
+  /* Create the array of small primes, and least-prime-factor/moebius table */
+  primes = make_primelist( M + 500, &nprimes );
+  factor_table = ft_create( M );
+  if (primes == 0 || factor_table == 0)
+    croak("Allocation failure in LMO Pi\n");
+
+  /* Create other arrays */
+  New(0, ss.sieve,           PHI_SIEVE_WORDS   + 2, uint32_t);
+  New(0, ss.word_count,      PHI_SIEVE_WORDS/2 + 2, uint8);
+  New(0, ss.word_count_sum,  PHI_SIEVE_WORDS/2 + 2, uint32);
+  New(0, ss.totals,          K3+2, UV);
+  New(0, ss.prime_index,     K3+2, uint32);
+  New(0, ss.first_bit_index, K3+2, uint32);
+  New(0, ss.multiplier,      K3+2, uint8);
+
+  if (ss.sieve == 0 || ss.word_count == 0 || ss.word_count_sum == 0 ||
+      ss.totals == 0 || ss.prime_index == 0 || ss.first_bit_index == 0 ||
+      ss.multiplier == 0)
+    croak("Allocation failure in LMO Pi\n");
+
+  /* Variables for fast prev_prime using small segment sieves (up to M^2) */
+  ps_max   = prev_sieve_max( primes[nprimes] );
+  ps_start = U32_CONST(0xFFFFFFFF);
+
+  /* Look for the smallest divisor: the smallest number > M which is
+   * square-free and not divisible by any prime covered by our Mapes
+   * small-phi case.  The largest value we will look up in the phi
+   * sieve is n/smallest_divisor. */
+  for (j = (M+1)/2; factor_table[j] <= primes[c]; j++) /* */;
+  smallest_divisor = 2*j+1;
+  /* largest_divisor = (N2 > (UV)M * (UV)M)  ?  N2  :  (UV)M * (UV)M; */
+
+  M = smallest_divisor - 1;   /* Increase M if possible */
+  piM = simple_pi(M);
+  if (piM < c)  croak("N too small for LMO\n");
+  last_phi_sieve = n / smallest_divisor + 1;
+
+  /* KM = smallest k, c <= k <= piM, s.t. primes[k+1] * primes[k+2] > M. */
+  for (KM = c; primes[KM+1] * primes[KM+2] <= M && KM < piM; KM++) /* */;
+  if (K3 < KM)  K3 = KM;  /* Ensure K3 >= KM */
+
+  /* Start calculating Pi(n).  Steps 4-10 from Bau. */
+  sum1 = (K2 - 1) + (UV) (piM - K3 - 1) * (UV) (piM - K3) / 2;
+  sum2 = 0;
+  end = (M+1)/2;
+
+  /* Start at index K2, which is the prime preceeding N^1/2 */
+  prime = prev_sieve_prime(N2+1);
+  prime_index = K2 - 1;
+  step7_max = K3;
+
+  /* Step 4:  For 1 <= x <= M where x is square-free and has no
+   * factor <= primes[c], sum phi(n / x, c). */
+  for (j = 0; j < end; j++) {
+    uint32 lpf = factor_table[j];
+    if (lpf > primes[c]) {
+      phi_value = mapes(n / (2*j+1), c);   /* x = 2j+1 */
+      if (lpf & 0x01) sum2 += phi_value; else sum1 += phi_value;
+    }
+  }
+
+  /* Step 5:  For 1+M/primes[c+1] <= x <= M, x square-free and
+   * has no factor <= primes[c+1], sum phi(n / (x*primes[c+1]), c). */
+  if (c < piM) {
+    UV pc_1 = primes[c+1];
+    for (j = (1+M/pc_1)/2; j < end; j++) {
+      uint32 lpf = factor_table[j];
+      if (lpf > pc_1) {
+        phi_value = mapes(n / (pc_1 * (2*j+1)), c);   /* x = 2j+1 */
+        if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
+      }
+    }
+  }
+
+  for (k = 0; k <= K3; k++)     ss.totals[k] = 0;
+  for (k = 0; k < KM; k++)      ss.prime_index[k] = end;
+
+  /* Instead of dividing by all primes up to pi(M), once a divisor is large
+   * enough then phi(n / (p*primes[k+1]), k) = 1. */
+  {
+    uint32 last_prime = piM;
+    for (k = KM; k < K3; k++) {
+      UV pk = primes[k+1];
+      while (last_prime > k+1 && pk * pk * primes[last_prime] > n)
+        last_prime--;
+      ss.prime_index[k] = last_prime;
+      sum1 += piM - last_prime;
+    }
+  }
+
+  for (sieve_start = 0; sieve_start < last_phi_sieve; sieve_start = sieve_end) {
+    /* This phi segment goes from sieve_start to sieve_end. */
+    sieve_end = ((sieve_start + 64*PHI_SIEVE_WORDS) <  last_phi_sieve)
+              ?   sieve_start + 64*PHI_SIEVE_WORDS  :  last_phi_sieve;
+    /* Only divisors s.t. sieve_start <= N / divisor < sieve_end considered. */
+    least_divisor = n / sieve_end;
+    /* Initialize the sieve segment and all associated variables. */
+    phi_total = init_segment(&ss, sieve_start, sieve_end - sieve_start, c, K3, primes);
+
+    /* Step 6:  For c < k < KM:  For 1+M/primes[k+1] <= x <= M, x square-free
+     * and has no factor <= primes[k+1], sum phi(n / (x*primes[k+1]), k). */
+    for (k = c+1; k < KM; k++) {
+      UV pk = primes[k+1];
+      uint32 start = (least_divisor >= pk * U32_CONST(0xFFFFFFFE))
+                   ? U32_CONST(0xFFFFFFFF)
+                   : (least_divisor / pk + 1)/2;
+      phi_total = remove_primes(k, k, &ss, primes);
+      for (j = ss.prime_index[k] - 1; j >= start; j--) {
+        uint32 lpf = factor_table[j];
+        if (lpf > pk) {
+          phi_value = sieve_phi(n / (pk * (2*j+1)));
+          if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
+        }
+      }
+      if (start < ss.prime_index[k])
+        ss.prime_index[k] = start;
+    }
+    /* Step 7:  For KM <= K < Pi_M:  For primes[k+2] <= x <= M,
+     * sum phi(n / (x*primes[k+1]), k). */
+    for (; k < step7_max; k++) {
+      phi_total = remove_primes(k, k, &ss, primes);
+      if (ss.prime_index[k] >= k+2) {
+        UV pk = primes[k+1];
+        for (j = ss.prime_index[k]; j >= k+2; j--) {
+          UV divisor = pk * primes[j];
+          if (divisor <= least_divisor) break;
+          sum1 += sieve_phi(n / divisor);
+        }
+        ss.prime_index[k] = j;
+      }
+    }
+    /* Restrict work for the above loop when we know it will be empty. */
+    while (step7_max > KM && ss.prime_index[step7_max-1] < (step7_max-1)+2)
+      step7_max--;
+
+    /* Step 8:  For KM <= K < K3, sum -phi(n / primes[k+1], k) */
+    phi_total = remove_primes(k, K3, &ss, primes);
+    /* Step 9:  For K3 <= k < K2, sum -phi(n / primes[k+1], k) + (k-K3). */
+    while (prime > least_divisor && prime_index >= piM) {
+      sum1 += prime_index - K3;
+      sum2 += sieve_phi(n / prime);
+      prime_index--;
+      prime = prev_sieve_prime(prime);
+    }
+  }
+
+  Safefree(ss.sieve);
+  Safefree(ss.word_count);
+  Safefree(ss.word_count_sum);
+  Safefree(ss.totals);
+  Safefree(ss.prime_index);
+  Safefree(ss.first_bit_index);
+  Safefree(ss.multiplier);
+  Safefree(factor_table);
+  Safefree(primes);
+
+  return sum1 - sum2;
+}
diff --git a/lmo.h b/lmo.h
new file mode 100644
index 0000000..95aa2ff
--- /dev/null
+++ b/lmo.h
@@ -0,0 +1,8 @@
+#ifndef MPU_LMO_H
+#define MPU_LMO_H
+
+#include "ptypes.h"
+
+extern UV _XS_LMO_pi(UV n);
+
+#endif
diff --git a/util.c b/util.c
index 0b17ca4..7a62714 100644
--- a/util.c
+++ b/util.c
@@ -60,7 +60,7 @@
 #include "sieve.h"
 #include "primality.h"
 #include "cache.h"
-#include "lehmer.h"
+#include "lmo.h"
 
 static int _verbose = 0;
 void _XS_set_verbose(int v) { _verbose = v; }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git