[libmath-prime-util-perl] 23/72: Move primality functions to new file. Use Monty routines

Thu May 21 18:49:37 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.32
in repository libmath-prime-util-perl.

commit f246f75cbf60d2086793b6a25c4f5290d879512a
Author: Dana Jacobsen <dana at acm.org>
Date:   Wed Sep 11 15:27:59 2013 -0700

    Move primality functions to new file.  Use Monty routines
---
 .gitignore                        |   1 +
 Changes                           |   9 +-
 MANIFEST                          |   2 +
 Makefile.PL                       |  13 +-
 TODO                              |   9 +-
 XS.xs                             |   3 +
 aks.c                             |   1 -
 factor.c                          | 527 +----------------------------
 factor.h                          |  21 --
 lib/Math/Prime/Util/PrimeArray.pm |   4 +-
 primality.c                       | 689 ++++++++++++++++++++++++++++++++++++++
 primality.h                       |  22 ++
 util.c                            |   2 +-
 util.h                            |  11 +
 xt/primality-small.pl             |   2 +-
 15 files changed, 757 insertions(+), 559 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4681eb4..55be186 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ factor.o
 sieve.o
 util.o
 lehmer.o
+primality.o
 aks.o
 blib*
 b[0-9][0-9][0-9][0-9][0-9][0-9].txt
diff --git a/Changes b/Changes
index 695ed5b..ef8973f 100644
--- a/Changes
+++ b/Changes
@@ -18,7 +18,14 @@ Revision history for Perl module Math::Prime::Util
       Thanks to Kim Walisch for all discussions about this.
 
     - divisor_sum can take a '1' as the second argument as an alias for
-      sub {1}, but works much faster.
+      sub {1}, but works much faster.  (experimental, may change)
+
+    - Incorporate Montgomery reduction for primality testing, thanks to
+      Wojciech Izykowski.  This is a 1.3 to 1.5x speedup for is_prob_prime,
+      is_prime, and is_strong_pseudoprime for numbers > 2^32 on x86_64.
+      This also things like prime_iterator for values > 2^32.
+
+    - Primality functions moved to their own file primality.c.
 
 0.31  2013-08-07
 
diff --git a/MANIFEST b/MANIFEST
index 1b3031f..d16dc1e 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -25,6 +25,8 @@ factor.h
 factor.c
 lehmer.h
 lehmer.c
+primality.h
+primality.c
 sieve.h
 sieve.c
 util.h
diff --git a/Makefile.PL b/Makefile.PL
index b0099c2..2e24249 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -7,12 +7,13 @@ WriteMakefile1(
     LICENSE      => 'perl',
     AUTHOR       => 'Dana A Jacobsen <dana at acm.org>',
 
-    OBJECT       => 'cache.o '  .
-                    'factor.o ' .
-                    'aks.o '    .
-                    'lehmer.o ' .
-                    'sieve.o '  .
-                    'util.o '   .
+    OBJECT       => 'cache.o '    .
+                    'factor.o '   .
+                    'primality.o '.
+                    'aks.o '      .
+                    'lehmer.o '   .
+                    'sieve.o '    .
+                    'util.o '     .
                     'XS.o',
     LIBS         => ['-lm'],
 
diff --git a/TODO b/TODO
index 146dc4c..97bb00f 100644
--- a/TODO
+++ b/TODO
@@ -39,6 +39,11 @@
 - Figure out a way to make the internal FOR_EACH_PRIME macros use a segmented
   sieve.
 
-- Refactor where functions exist in .c.  Lots of primality tests in factor.c.
-
 - Rewrite 23-primality-proofs.t for new format (keep some of the old tests?).
+
+- Use Montgomery routines in more places: Factoring and Lucas tests.
+
+- Add a group order function.  We have a naieve method used in AKS (where the
+  values are so small it doesn't matter).  See:
+  http://cs-haven.blogspot.com/2012/02/efficiently-computing-order-of-element.html
+  for some discussion of different methods.
diff --git a/XS.xs b/XS.xs
index 2150603..84a7ba3 100644
--- a/XS.xs
+++ b/XS.xs
@@ -14,6 +14,7 @@
 #include "cache.h"
 #include "sieve.h"
 #include "util.h"
+#include "primality.h"
 #include "factor.h"
 #include "lehmer.h"
 #include "aks.h"
@@ -484,6 +485,7 @@ _XS_is_prime(IN UV n)
     _XS_is_extra_strong_lucas_pseudoprime = 4
     _XS_is_frobenius_underwood_pseudoprime = 5
     _XS_is_aks_prime = 6
+    _XS_BPSW = 7
   PREINIT:
     int ret;
   CODE:
@@ -495,6 +497,7 @@ _XS_is_prime(IN UV n)
       case 4: ret = _XS_is_lucas_pseudoprime(n, 2); break;
       case 5: ret = _XS_is_frobenius_underwood_pseudoprime(n); break;
       case 6: ret = _XS_is_aks_prime(n); break;
+      case 7: ret = _XS_BPSW(n); break;
       default: croak("_XS_is_prime: Unknown function alias"); break;
     }
     RETVAL = ret;
diff --git a/aks.c b/aks.c
index ba86e69..119d55b 100644
--- a/aks.c
+++ b/aks.c
@@ -28,7 +28,6 @@
 #include "aks.h"
 #include "util.h"
 #include "sieve.h"
-#include "factor.h"
 #include "cache.h"
 #include "mulmod.h"
 
diff --git a/factor.c b/factor.c
index b7166de..e43f335 100644
--- a/factor.c
+++ b/factor.c
@@ -3,13 +3,14 @@
 #include <string.h>
 #include <math.h>
 
-#define FUNC_gcd_ui
 #include "ptypes.h"
 #include "factor.h"
-#include "util.h"
 #include "sieve.h"
 #include "mulmod.h"
 #include "cache.h"
+#include "primality.h"
+#define FUNC_gcd_ui
+#include "util.h"
 
 /*
  * You need to remember to use UV for unsigned and IV for signed types that
@@ -289,396 +290,6 @@ static int is_perfect_square(UV n, UV* sqrtn)
   return 1;
 }
 
-static int jacobi_iu(IV in, UV m) {
-  int j = 1;
-  UV n = (in < 0) ? -in : in;
-
-  if (m <= 0 || (m%2) == 0) return 0;
-  if (in < 0 && (m%4) == 3) j = -j;
-  while (n != 0) {
-    while ((n % 2) == 0) {
-      n >>= 1;
-      if ( (m % 8) == 3 || (m % 8) == 5 )  j = -j;
-    }
-    { UV t = n; n = m; m = t; }
-    if ( (n % 4) == 3 && (m % 4) == 3 )  j = -j;
-    n = n % m;
-  }
-  return (m == 1) ? j : 0;
-}
-
-
-/* Fermat pseudoprime */
-int _XS_is_pseudoprime(UV n, UV a)
-{
-  UV x;
-  UV const nm1 = n-1;
-
-  if (n == 2 || n == 3)  return 1;
-  if (n < 5) return 0;
-  if (a < 2) croak("Base %"UVuf" is invalid", a);
-  if (a >= n) {
-    a %= n;
-    if ( a <= 1 || a == nm1 )
-      return 1;
-  }
-
-  x = powmod(a, nm1, n);    /* x = a^(n-1) mod n */
-  return (x == 1);
-}
-
-
-/* Miller-Rabin probabilistic primality test
- * Returns 1 if probably prime relative to the bases, 0 if composite.
- * Bases must be between 2 and n-2
- */
-int _XS_miller_rabin(UV n, const UV *bases, int nbases)
-{
-  UV const nm1 = n-1;
-  UV d = n-1;
-  int b, r, s = 0;
-
-  MPUassert(n > 3, "MR called with n <= 3");
-
-  while ( (d&1) == 0 ) {
-    s++;
-    d >>= 1;
-  }
-  for (b = 0; b < nbases; b++) {
-    UV x, a = bases[b];
-
-    if (a < 2)
-      croak("Base %"UVuf" is invalid", a);
-    if (a >= n)
-      a %= n;
-    if ( (a <= 1) || (a == nm1) )
-      continue;
-
-    /* n is a strong pseudoprime to this base if either
-     *   -  a^d = 1 mod n
-     *   -  a^(d2^r) = -1 mod n for some r: 0 <= r <= s-1
-     */
-
-    x = powmod(a, d, n);
-    if ( (x == 1) || (x == nm1) )  continue;
-
-    /* cover r = 1 to s-1, r=0 was just done */
-    for (r = 1; r < s; r++) {
-      x = sqrmod(x, n);
-      if ( x == nm1 )  break;
-      if ( x == 1   )  return 0;
-    }
-    if (r >= s)
-      return 0;
-  }
-  return 1;
-}
-
-/* M-R with a = 2 and some checks removed.  For internal use. */
-int _SPRP2(UV n)
-{
-  UV const nm1 = n-1;
-  UV d = n-1;
-  UV x;
-  int r, s = 0;
-
-  MPUassert(n > 3, "S-PRP-2 called with n <= 3");
-  if (!(n & 1)) return 0;
-  while ( (d & 1) == 0 ) {  s++;  d >>= 1; }
-  /* n is a strong pseudoprime to this base if either
-   *   -  a^d = 1 mod n
-   *   -  a^(d2^r) = -1 mod n for some r: 0 <= r <= s-1 */
-  x = powmod(2, d, n);
-  if (x == 1 || x == nm1)  return 1;
-
-  /* just did r=0, now test r = 1 to s-1 */
-  for (r = 1; r < s; r++) {
-    x = sqrmod(x, n);
-    if (x == nm1)  return 1;
-  }
-  return 0;
-}
-
-/* Select M-R bases from http://miller-rabin.appspot.com/, 27 May 2013 */
-#if BITS_PER_WORD == 32
-static const UV mr_bases_small_2[2] = {31, 73};
-static const UV mr_bases_small_3[3] = {2, 7, 61};
-#else
-static const UV mr_bases_large_1[1] = { UVCONST(  9345883071009581737 ) };
-static const UV mr_bases_large_2[2] = { UVCONST(         336781006125 ),
-                                        UVCONST(     9639812373923155 ) };
-static const UV mr_bases_large_3[3] = { UVCONST(  4230279247111683200 ),
-                                        UVCONST( 14694767155120705706 ),
-                                        UVCONST( 16641139526367750375 ) };
-#endif
-
-int _XS_is_prob_prime(UV n)
-{
-  int ret;
-
-  if (n < 11) {
-    if (n == 2 || n == 3 || n == 5 || n == 7)     return 2;
-    else                                          return 0;
-  }
-  if (!(n%2) || !(n%3) || !(n%5) || !(n%7))       return 0;
-  if (n <  121) /* 11*11 */                       return 2;
-  if (!(n%11) || !(n%13) || !(n%17) || !(n%19) ||
-      !(n%23) || !(n%29) || !(n%31) || !(n%37) ||
-      !(n%41) || !(n%43) || !(n%47) || !(n%53))   return 0;
-  if (n < 3481) /* 59*59 */                       return 2;
-
-#if BITS_PER_WORD == 32
-  /* We could use one base when n < 49191, two when n < 360018361. */
-  if (n < UVCONST(9080191))
-    ret = _XS_miller_rabin(n, mr_bases_small_2, 2);
-  else
-    ret = _XS_miller_rabin(n, mr_bases_small_3, 3);
-#else
-  /* AESLSP test costs about 1.5 Selfridges, vs. ~2.2 for strong Lucas.
-   * So it works out to be faster to do AES-BPSW vs. 3 M-R tests. */
-  if (n < UVCONST(341531))
-    ret = _XS_miller_rabin(n, mr_bases_large_1, 1);
-  else if (n < UVCONST(1050535501))
-    ret = _XS_miller_rabin(n, mr_bases_large_2, 2);
-  else
-    ret = _SPRP2(n) && _XS_is_almost_extra_strong_lucas_pseudoprime(n,1);
-#endif
-  return 2*ret;
-}
-
-/* Generic Lucas sequence for any appropriate P and Q */
-void lucas_seq(UV* Uret, UV* Vret, UV* Qkret, UV n, IV P, IV Q, UV k)
-{
-  UV U, V, b, Dmod, Qmod, Pmod, Qk;
-
-  if (k == 0) {
-    *Uret = 0;
-    *Vret = 2;
-    *Qkret = Q;
-    return;
-  }
-
-  Qmod = (Q < 0)  ?  (UV)(Q + (IV)n)  :  (UV)Q;
-  Pmod = (P < 0)  ?  (UV)(P + (IV)n)  :  (UV)P;
-  Dmod = submod( mulmod(Pmod, Pmod, n), mulmod(4, Qmod, n), n);
-  MPUassert(Dmod != 0, "lucas_seq: D is 0");
-  U = 1;
-  V = Pmod;
-  Qk = Qmod;
-  { UV v = k; b = 1; while (v >>= 1) b++; }
-
-  if (Q == 1) {
-    while (b > 1) {
-      U = mulmod(U, V, n);
-      V = mulsubmod(V, V, 2, n);
-      b--;
-      if ( (k >> (b-1)) & UVCONST(1) ) {
-        UV t2 = mulmod(U, Dmod, n);
-        U = muladdmod(U, Pmod, V, n);
-        if (U & 1) { U = (n>>1) + (U>>1) + 1; } else { U >>= 1; }
-        V = muladdmod(V, P, t2, n);
-        if (V & 1) { V = (n>>1) + (V>>1) + 1; } else { V >>= 1; }
-      }
-    }
-  } else if (P == 1 && Q == -1) {
-    /* This is about 30% faster than the generic code below.  Since 50% of
-     * Lucas and strong Lucas tests come here, I think it's worth doing. */
-    int sign = Q;
-    while (b > 1) {
-      U = mulmod(U, V, n);
-      if (sign == 1) V = mulsubmod(V, V, 2, n);
-      else           V = muladdmod(V, V, 2, n);
-      sign = 1;   /* Qk *= Qk */
-      b--;
-      if ( (k >> (b-1)) & UVCONST(1) ) {
-        UV t2 = mulmod(U, Dmod, n);
-        U = addmod(U, V, n);
-        if (U & 1) { U = (n>>1) + (U>>1) + 1; } else { U >>= 1; }
-        V = addmod(V, t2, n);
-        if (V & 1) { V = (n>>1) + (V>>1) + 1; } else { V >>= 1; }
-        sign = -1;  /* Qk *= Q */
-      }
-    }
-    if (sign == 1) Qk = 1;
-  } else {
-    while (b > 1) {
-      U = mulmod(U, V, n);
-      V = mulsubmod(V, V, addmod(Qk,Qk,n), n);
-      Qk = sqrmod(Qk, n);
-      b--;
-      if ( (k >> (b-1)) & UVCONST(1) ) {
-        UV t2 = mulmod(U, Dmod, n);
-        U = muladdmod(U, Pmod, V, n);
-        if (U & 1) { U = (n>>1) + (U>>1) + 1; } else { U >>= 1; }
-        V = muladdmod(V, P, t2, n);
-        if (V & 1) { V = (n>>1) + (V>>1) + 1; } else { V >>= 1; }
-        Qk = mulmod(Qk, Qmod, n);
-      }
-    }
-  }
-  *Uret = U;
-  *Vret = V;
-  *Qkret = Qk;
-}
-
-/* Lucas tests:
- *  0: Standard
- *  1: Strong
- *  2: Extra Strong (Mo/Jones/Grantham)
- *
- * Goal:
- *       (1) no false results when combined with the SPRP-2 test.
- *       (2) fast enough to use SPRP-2 + this in place of 3+ M-R tests.
- *
- * For internal purposes, we typically want to use the extra strong test
- * because it is slightly faster (Q = 1 simplies things).  None of them have
- * any false positives for the BPSW test.
- *
- * This runs 4-7x faster than MPU::GMP, which means ~10x faster than most GMP
- * implementations.  It is about 2x slower than a single M-R test.
- */
-int _XS_is_lucas_pseudoprime(UV n, int strength)
-{
-  IV P, Q, D;
-  UV U, V, Qk, d, s;
-
-  if (n == 2 || n == 3) return 1;
-  if (n < 5 || (n%2) == 0) return 0;
-  if (n == UV_MAX) return 0;
-
-  if (strength < 2) {
-    UV Du = 5;
-    IV sign = 1;
-    while (1) {
-      D = Du * sign;
-      if (gcd_ui(Du, n) > 1 && gcd_ui(Du, n) != n) return 0;
-      if (jacobi_iu(D, n) == -1)
-        break;
-      if (Du == 21 && is_perfect_square(n, 0)) return 0;
-      Du += 2;
-      sign = -sign;
-    }
-    P = 1;
-    Q = (1 - D) / 4;
-  } else {
-    P = 3;
-    Q = 1;
-    while (1) {
-      D = P*P - 4;
-      if (gcd_ui(D, n) > 1 && gcd_ui(D, n) != n) return 0;
-      if (jacobi_iu(D, n) == -1)
-        break;
-      if (P == 21 && is_perfect_square(n, 0)) return 0;
-      P++;
-    }
-  }
-  MPUassert( D == (P*P - 4*Q) , "is_lucas_pseudoprime: incorrect DPQ");
-
-  d = n+1;
-  s = 0;
-  if (strength > 0)
-    while ( (d & 1) == 0 ) {  s++;  d >>= 1; }
-  lucas_seq(&U, &V, &Qk, n, P, Q, d);
-
-  if (strength == 0) {
-    if (U == 0)
-      return 1;
-  } else if (strength == 1) {
-    if (U == 0)
-      return 1;
-    /* Now check to see if V_{d*2^r} == 0 for any 0 <= r < s */
-    while (s--) {
-      if (V == 0)
-        return 1;
-      if (s) {
-        V = mulsubmod(V, V, addmod(Qk,Qk,n), n);
-        Qk = sqrmod(Qk, n);
-      }
-    }
-  } else {
-    if ( U == 0 && (V == 2 || V == (n-2)) )
-      return 1;
-    /* Now check to see if V_{d*2^r} == 0 for any 0 <= r < s-1 */
-    s--;
-    while (s--) {
-      if (V == 0)
-        return 1;
-      if (s)
-        V = mulsubmod(V, V, 2, n);
-    }
-  }
-  return 0;
-}
-
-/* A generalization of Pari's shortcut to the extra-strong Lucas test.
- * I've added a gcd check at the top, which needs to be done and also results
- * in fewer pseudoprimes.  Pari always does trial division to 100 first so
- * is unlikely to come up there.  This only calculate V, which can be done
- * faster, but that means we have more pseudoprimes than the standard
- * extra-strong test.
- *
- * increment:  1 for Baillie OEIS, 2 for Pari.
- *
- * With increment = 1, these results will be a subset of the extra-strong
- * Lucas pseudoprimes.  With increment = 2, we produce Pari's results.
- */
-int _XS_is_almost_extra_strong_lucas_pseudoprime(UV n, UV increment)
-{
-  UV P, V, d, s;
-
-  if (n == 2 || n == 3 || n == 5) return 1;
-  if (n < 7 || (n%2) == 0) return 0;
-  if (n == UV_MAX) return 0;
-  if (increment < 1 || increment > 256)
-    croak("Invalid lucas paramater increment: %"UVuf"\n", increment);
-
-  P = 3;
-  while (1) {
-    UV D = P*P - 4;
-    d = gcd_ui(D, n);
-    if (d > 1 && d < n)
-      return 0;
-    if (jacobi_iu(D, n) == -1)
-      break;
-    if (P == (3+20*increment) && is_perfect_square(n, 0)) return 0;
-    P += increment;
-    if (P > 65535)
-      croak("lucas_extrastrong_params: P exceeded 65535");
-  }
-  if (P >= n)  P %= n;   /* Never happens with increment < 4 */
-
-  d = n+1;
-  s = 0;
-  while ( (d & 1) == 0 ) {  s++;  d >>= 1; }
-
-  {
-    UV W, b;
-    V = P;
-    W = mulsubmod(P, P, 2, n);
-    { UV v = d; b = 1; while (v >>= 1) b++; }
-    while (b-- > 1) {
-      if ( (d >> (b-1)) & UVCONST(1) ) {
-        V = mulsubmod(V, W, P, n);
-        W = mulsubmod(W, W, 2, n);
-      } else {
-        W = mulsubmod(V, W, P, n);
-        V = mulsubmod(V, V, 2, n);
-      }
-    }
-  }
-
-  if (V == 2 || V == (n-2))
-    return 1;
-  while (s-- > 1) {
-    if (V == 0)
-      return 1;
-    V = mulsubmod(V, V, 2, n);
-    if (V == 2)
-      return 0;
-  }
-  return 0;
-}
-
 
 UV _XS_divisor_sum(UV n)
 {
@@ -1378,135 +989,3 @@ int racing_squfof_factor(UV n, UV *factors, UV rounds)
   factors[0] = n;
   return 1;
 }
-
-
-/****************************************************************************/
-
-/*
- *
- * The Frobenius-Underwood test has no known counterexamples below 10^13, but
- * has not been extensively tested above that.  This is the Minimal Lambda+2
- * test from section 9 of "Quadratic Composite Tests" by Paul Underwood.
- *
- * Given the script:
- *  time mpu 'forprimes { Math::Prime::Util::_XS_is_frobenius_underwood_pseudoprime($_); Math::Prime::Util::_XS_is_frobenius_underwood_pseudoprime($_+2); } 500_000_000'
- * and replacing the tests appropriately, I get these times:
- *
- *    0.87    $_ (cost of empty loop)
- *   21.37    _XS_is_pseudoprime($_,2)
- *   22.42    _XS_miller_rabin($_,2)
- *   44.53    _XS_is_lucas_pseudoprime($_)
- *   43.95    _XS_is_strong_lucas_pseudoprime($_)
- *   40.09    _XS_is_extra_strong_lucas_pseudoprime($_)
- *   25.86    _XS_is_almost_extra_strong_lucas_pseudoprime($_)
- *   42.40    _XS_is_frobenius_underwood_pseudoprime($_)
- *   27.02    _XS_is_prob_prime($_)
- *   27.24    _XS_is_prime($_)
- *
- * At these sizes is_prob_prime is doing 1-2 M-R tests.  The input validation
- * is adding a noticeable overhead to is_prime.
- *
- * With a set of 100k 64-bit random primes; 'do { die unless ... } for 1..50'
- *
- *   0.32    empty loop
- *  10.25    _XS_is_pseudoprime($_,2)
- *  10.06    _XS_miller_rabin($_,2)
- *  22.02    _XS_is_lucas_pseudoprime($_)
- *  21.81    _XS_is_strong_lucas_pseudoprime($_)
- *  20.99    _XS_is_extra_strong_lucas_pseudoprime($_)
- *  14.01    _XS_is_almost_extra_strong_lucas_pseudoprime($_)
- *  18.44    _XS_is_frobenius_underwood_pseudoprime($_)
- *  24.11    _XS_is_prob_prime($_)
- *  24.06    _XS_is_prime($_)
- *
- * At this point is_prob_prime has transitioned to BPSW.
- *
- * Calling a powmod a 'Selfridge' unit, then we see:
- *    1   Selfridge unit    M-R test
- *    1.4 Selfridge unit    "almost extra strong" Lucas
- *    2   Selfridge units   Lucas or Frobenius-Underwood
- *    3   Selfridge units   BPSW (standard, strong, or extra-strong)
- *
- * We try to structure the primality test like:
- *   1) simple divisibility    very fast       primes and ~10% of composites
- *   2) M-R with base 2        1 Selfridge     primes and .00000000002% comps
- *   3) Lucas test             2 Selfridge     only primes
- *
- * Hence given a random composite, about 90% of the time it costs us almost
- * nothing.  After spending 1 Selfridge on the first MR test, less than 32M
- * composites remain undecided out of 18 quintillion 64-bit composites.  The
- * final Lucas test has no false positives.
- * Replacing the Lucas test with the F-U test won't save any time.  Replacing
- * the whole thing with the F-U test (assuming it has no false results for
- * all 64-bit values, which hasn't been verified), doesn't help either.
- * It's 2/3 the cost for primes, but much more expensive for composites.  It
- * seems of interest for > 2^64 as a different test to do in addition to BPSW.
- */
-
-
-int _XS_is_frobenius_underwood_pseudoprime(UV n)
-{
-  int bit;
-  UV x, result, multiplier, a, b, np1, len, t1, t2, na;
-  IV t;
-
-  if (n < 2) return 0;
-  if (n < 4) return 1;
-  if ((n % 2) == 0) return 0;
-  if (is_perfect_square(n,0)) return 0;
-  if (n == UV_MAX) return 0;
-
-  x = 0;
-  t = -1;
-  while ( jacobi_iu( t, n ) != -1 ) {
-    x++;
-    t = (IV)(x*x) - 4;
-  }
-  result = addmod( addmod(x, x, n), 5, n);
-  multiplier = addmod(x, 2, n);
-
-  a = 1;
-  b = 2;
-  np1 = n+1;
-  { UV v = np1; len = 1;  while (v >>= 1) len++; }
-
-  if (x == 0) {
-    for (bit = len-2; bit >= 0; bit--) {
-      t2 = addmod(b, b, n);
-      na = mulmod(a, t2, n);
-      t1 = addmod(b, a, n);
-      t2 = submod(b, a, n);
-      b = mulmod(t1, t2, n);
-      a = na;
-      if ( (np1 >> bit) & UVCONST(1) ) {
-        t1 = mulmod(a, 2, n);
-        na = addmod(t1, b, n);
-        t1 = addmod(b, b, n);
-        b = submod(t1, a, n);
-        a = na;
-      }
-    }
-  } else {
-    for (bit = len-2; bit >= 0; bit--) {
-      t1 = mulmod(a, x, n);
-      t2 = addmod(b, b, n);
-      t1 = addmod(t1, t2, n);
-      na = mulmod(a, t1, n);
-      t1 = addmod(b, a, n);
-      t2 = submod(b, a, n);
-      b = mulmod(t1, t2, n);
-      a = na;
-      if ( (np1 >> bit) & UVCONST(1) ) {
-        t1 = mulmod(a, multiplier, n);
-        na = addmod(t1, b, n);
-        t1 = addmod(b, b, n);
-        b = submod(t1, a, n);
-        a = na;
-      }
-    }
-  }
-  if (_XS_get_verbose()>1) printf("%"UVuf" is %s with x = %"UVuf"\n", n, (a == 0 && b == result) ? "probably prime" : "composite", x);
-  if (a == 0 && b == result)
-    return 1;
-  return 0;
-}
diff --git a/factor.h b/factor.h
index c7dd3ae..2979b32 100644
--- a/factor.h
+++ b/factor.h
@@ -18,27 +18,6 @@ extern int pplus1_factor(UV n, UV *factors, UV B);
 extern int squfof_factor(UV n, UV *factors, UV rounds);
 extern int racing_squfof_factor(UV n, UV *factors, UV rounds);
 
-
-extern int _XS_is_pseudoprime(UV n, UV a);
-extern int _XS_miller_rabin(UV n, const UV *bases, int nbases);
-extern int _SPRP2(UV n);
-extern int _XS_is_prob_prime(UV n);
-extern void lucas_seq(UV* U, UV* V, UV* Qk,  UV n, IV P, IV Q, UV k);
-extern int _XS_is_lucas_pseudoprime(UV n, int strength);
-extern int _XS_is_frobenius_underwood_pseudoprime(UV n);
-extern int _XS_is_almost_extra_strong_lucas_pseudoprime(UV n, UV increment);
-
 extern UV _XS_divisor_sum(UV n);
 
-#ifdef FUNC_gcd_ui
-static UV gcd_ui(UV x, UV y) {
-  UV t;
-  if (y < x) { t = x; x = y; y = t; }
-  while (y > 0) {
-    t = y;  y = x % y;  x = t;  /* y1 <- x0 % y0 ; x1 <- y0 */
-  }
-  return x;
-}
-#endif
-
 #endif
diff --git a/lib/Math/Prime/Util/PrimeArray.pm b/lib/Math/Prime/Util/PrimeArray.pm
index baf7815..a47795a 100644
--- a/lib/Math/Prime/Util/PrimeArray.pm
+++ b/lib/Math/Prime/Util/PrimeArray.pm
@@ -199,8 +199,8 @@ Example:
 
 If you want sequential primes with low memory, I recommend using
 L<Math::Prime::Util/forprimes>.  It is much faster, as the tied array
-functionality in Perl is not high performance.  It does have limitations
-vs. the prime array, but many tasks find they can use it.
+functionality in Perl is not high performance.  It isn't as flexible as
+the prime array, but it is a very common pattern.
 
 If you prefer an iterator pattern, I would recommend using
 L<Math::Prime::Util/prime_iterator>.  It will be a bit faster than using this
diff --git a/primality.c b/primality.c
new file mode 100644
index 0000000..171317d
--- /dev/null
+++ b/primality.c
@@ -0,0 +1,689 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "ptypes.h"
+#include "primality.h"
+#include "mulmod.h"
+#define FUNC_gcd_ui
+#include "util.h"
+
+/* Primality related functions, including Montgomery math */
+
+/* TODO:
+ *      - Convert F-U test to Montgomery
+ *      - Convert Lucas tests to Montgomery
+ */
+
+static const UV mr_bases_const2[1] = {2};
+
+/******************************************************************************
+  Code inside USE_MONT_PRIMALITY is Montgomery math and efficient M-R from
+  Wojciech Izykowski.  See:  https://github.com/wizykowski/miller-rabin
+
+Copyright (c) 2013, Wojciech Izykowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * The name of the author may not be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************************************************************/
+#if USE_MONT_PRIMALITY
+static INLINE uint64_t mont_prod64(uint64_t a, uint64_t b, uint64_t n, uint64_t npi)
+{
+  uint64_t t_hi, t_lo, m, mn_hi, mn_lo, u;
+  int carry;
+  /* t_hi * 2^64 + t_lo = a*b */
+  asm("mulq %3" : "=a"(t_lo), "=d"(t_hi) : "a"(a), "rm"(b));
+  m = t_lo * npi;
+  /* mn_hi * 2^64 + mn_lo = m*n */
+  asm("mulq %3" : "=a"(mn_lo), "=d"(mn_hi) : "a"(m), "rm"(n));
+  carry = t_lo + mn_lo < t_lo ? 1 : 0;
+  u = t_hi + mn_hi + carry;
+  if (u < t_hi) return u-n;
+  return u >= n ? u-n : u;
+}
+#define mont_square64(a, n, npi)  mont_prod64(a, a, n, npi)
+static INLINE UV mont_powmod64(uint64_t a, uint64_t k, uint64_t one, uint64_t n, uint64_t npi)
+{
+  uint64_t t = one;
+  while (k) {
+    if (k & 1) t = mont_prod64(t, a, n, npi);
+    k >>= 1;
+    if (k)     a = mont_square64(a, n, npi);
+  }
+  return t;
+}
+static INLINE uint64_t modular_inverse64(const uint64_t a)
+{
+  uint64_t u,x,w,z,q;
+
+  x = 1; z = a;
+
+  q = (-z)/z + 1;  /* = 2^64 / z                 */
+  u = - q;         /* = -q * x                   */
+  w = - q * z;     /* = b - q * z = 2^64 - q * z */
+
+  /* after first iteration all variables are 64-bit */
+
+  while (w) {
+    if (w < z) {
+      q = u; u = x; x = q; /* swap(u, x) */
+      q = w; w = z; z = q; /* swap(w, z) */
+    }
+    q = w / z;
+    u -= q * x;
+    w -= q * z;
+  }
+  return x;
+}
+static INLINE uint64_t compute_modn64(const uint64_t n)
+{
+  
+  if (n <= (1ULL << 63)) {
+    uint64_t res = ((1ULL << 63) % n) << 1;
+    return res < n ? res : res-n;
+  } else
+    return -n;
+}
+#define compute_a_times_2_64_mod_n(a, n, r)   mulmod(a, r, n)
+static INLINE uint64_t compute_2_65_mod_n(const uint64_t n, const uint64_t modn)
+{
+  if (n <= (1ULL << 63)) {
+    uint64_t res = modn << 1;
+    return res < n ? res : res - n;
+  } else {
+    /* n can fit 2 or 3 times in 2^65 */
+    if (n > UVCONST(12297829382473034410))
+      return -n-n;    /* 2^65 mod n = 2^65 - 2*n */
+    else
+      return -n-n-n;  /* 2^65 mod n = 2^65 - 3*n */
+  }
+}
+/* static INLINE int efficient_mr64(const uint64_t bases[], const int cnt, const uint64_t n) */
+static int monty_mr64(const uint64_t n, const UV* bases, int cnt)
+{
+  int i, j, t;
+  const uint64_t npi = modular_inverse64(-((int64_t)n));
+  const uint64_t r = compute_modn64(n);
+  uint64_t u = n - 1;
+  const uint64_t nr = n - r;
+
+  t = 0;
+  while (!(u&1)) {  t++;  u >>= 1;  }
+  for (j = 0; j < cnt; j++) {
+    const uint64_t a = bases[j];
+    uint64_t       A = compute_a_times_2_64_mod_n(a, n, r);
+    uint64_t       d;
+    if (a < 2)  croak("Base %"UVuf" is invalid", (UV)a);
+    if (!A) continue;  /* PRIME in subtest */
+    d = mont_powmod64(A, u, r, n, npi);  /* compute a^u mod n */
+    if (d == r || d == nr) continue;  /* PRIME in subtest */
+    for (i=1; i<t; i++) {
+      d = mont_square64(d, n, npi);
+      if (d == r) return 0;
+      if (d == nr) break;  /* PRIME in subtest */
+    }
+    if (i == t) return 0;
+  }
+  return 1;
+}
+#endif
+/******************************************************************************/
+
+
+
+
+/* Helper functions */
+static int is_perfect_square(UV n, UV* sqrtn)
+{
+  UV m;
+  m = n & 127;
+  if ((m*0x8bc40d7d) & (m*0xa1e2f5d1) & 0x14020a)  return 0;
+  m = isqrt(n);
+  if (n != (m*m))
+    return 0;
+  if (sqrtn != 0)
+    *sqrtn = m;
+  return 1;
+}
+static int jacobi_iu(IV in, UV m) {
+  int j = 1;
+  UV n = (in < 0) ? -in : in;
+
+  if (m <= 0 || (m%2) == 0) return 0;
+  if (in < 0 && (m%4) == 3) j = -j;
+  while (n != 0) {
+    while ((n % 2) == 0) {
+      n >>= 1;
+      if ( (m % 8) == 3 || (m % 8) == 5 )  j = -j;
+    }
+    { UV t = n; n = m; m = t; }
+    if ( (n % 4) == 3 && (m % 4) == 3 )  j = -j;
+    n = n % m;
+  }
+  return (m == 1) ? j : 0;
+}
+
+
+
+
+/* Fermat pseudoprime */
+int _XS_is_pseudoprime(UV const n, UV a)
+{
+  UV x;
+
+  if (n == 2 || n == 3)  return 1;
+  if (n < 5) return 0;
+  if (a < 2) croak("Base %"UVuf" is invalid", a);
+  if (a >= n) {
+    a %= n;
+    if ( a <= 1 || a == n-1 )
+      return 1;
+  }
+  x = powmod(a, n-1, n);    /* x = a^(n-1) mod n */
+  return (x == 1);
+}
+
+/* Miller-Rabin probabilistic primality test
+ * Returns 1 if probably prime relative to the bases, 0 if composite.
+ * Bases must be between 2 and n-2
+ */
+int _XS_miller_rabin(UV const n, const UV *bases, int nbases)
+{
+  UV d = n-1;
+  int b, r, s = 0;
+
+  MPUassert(n > 3, "MR called with n <= 3");
+  if ((n & 1) == 0) return 0;
+
+  if (USE_MONT_PRIMALITY && n >= UVCONST(4294967295))
+    return monty_mr64((uint64_t)n, bases, nbases);
+
+  while (!(d&1)) {  s++;  d >>= 1;  }
+
+  for (b = 0; b < nbases; b++) {
+    UV x, a = bases[b];
+    if (a < 2)  croak("Base %"UVuf" is invalid", a);
+    if (a >= n) a %= n;
+    if ( (a <= 1) || (a == n-1) )
+      continue;
+    /* n is a strong pseudoprime to base a if either:
+     *    a^d = 1 mod n
+     *    a^(d2^r) = -1 mod n for some r: 0 <= r <= s-1
+     */
+    x = powmod(a, d, n);
+    if ( (x == 1) || (x == n-1) )  continue;
+    for (r = 1; r < s; r++) {  /* r=0 was just done, test r = 1 to s-1 */
+      x = sqrmod(x, n);
+      if ( x == n-1 )  break;
+      if ( x == 1   )  return 0;
+    }
+    if (r >= s)  return 0;
+  }
+  return 1;
+}
+
+int _XS_BPSW(UV const n)
+{
+  if (n == 2 || n == 3 || n == 5) return 1;
+  if (n < 7 || (n%2) == 0 || n == UV_MAX) return 0;
+
+#if !USE_MONT_PRIMALITY
+  return    _XS_miller_rabin(n, mr_bases_const2, 1)
+         && _XS_is_almost_extra_strong_lucas_pseudoprime(n,1);
+#else
+  if (n < UVCONST(4294967295)) {
+    return    _XS_miller_rabin(n, mr_bases_const2, 1)
+           && _XS_is_almost_extra_strong_lucas_pseudoprime(n,1);
+  } else {
+    const uint64_t npi = modular_inverse64(-((int64_t)n));
+    const uint64_t montr = compute_modn64(n);
+    const uint64_t mont2 = compute_2_65_mod_n(n, montr);
+    uint64_t u = n-1;
+    const uint64_t nr = n-montr;
+    int i, t = 0;
+    UV P, V, d, s;
+
+    /* M-R with base 2 */
+    while (!(u&1)) {  t++;  u >>= 1;  }
+    {
+      uint64_t A = mont2;
+      if (A) {
+        uint64_t d = mont_powmod64(A, u, montr, n, npi);
+        if (d != montr && d != nr) {
+          for (i=1; i<t; i++) {
+            d = mont_square64(d, n, npi);
+            if (d == montr) return 0;
+            if (d == nr) break;
+          }
+          if (i == t)
+            return 0;
+        }
+      }
+    }
+    /* AES Lucas test */
+    P = 3;
+    while (1) {
+      UV D = P*P - 4;
+      d = gcd_ui(D, n);
+      if (d > 1 && d < n)
+        return 0;
+      if (jacobi_iu(D, n) == -1)
+        break;
+      if (P == (3+20) && is_perfect_square(n, 0)) return 0;
+      P++;
+      if (P > 65535)
+        croak("lucas_extrastrong_params: P exceeded 65535");
+    }
+
+    d = n+1;
+    s = 0;
+    while ( (d & 1) == 0 ) {  s++;  d >>= 1; }
+
+    {
+      const uint64_t montP = compute_a_times_2_64_mod_n(P, n, montr);
+      UV W, b;
+      W = submod(  mont_prod64( montP, montP, n, npi),  mont2, n);
+      V = montP;
+      { UV v = d; b = 1; while (v >>= 1) b++; }
+      while (b-- > 1) {
+        if ( (d >> (b-1)) & UVCONST(1) ) {
+          V = submod(  mont_prod64(V, W, n, npi),  montP, n);
+          W = submod(  mont_prod64(W, W, n, npi),  mont2, n);
+        } else {
+          W = submod(  mont_prod64(V, W, n, npi),  montP, n);
+          V = submod(  mont_prod64(V, V, n, npi),  mont2, n);
+        }
+      }
+    }
+
+    if (V == mont2 || V == (n-mont2))
+      return 1;
+    while (s-- > 1) {
+      if (V == 0)
+        return 1;
+      V = submod(  mont_prod64(V, V, n, npi),  mont2, n);
+      if (V == mont2)
+        return 0;
+    }
+  }
+  return 0;
+#endif
+}
+
+/* Generic Lucas sequence for any appropriate P and Q */
+void lucas_seq(UV* Uret, UV* Vret, UV* Qkret, UV n, IV P, IV Q, UV k)
+{
+  UV U, V, b, Dmod, Qmod, Pmod, Qk;
+
+  if (k == 0) {
+    *Uret = 0;
+    *Vret = 2;
+    *Qkret = Q;
+    return;
+  }
+
+  Qmod = (Q < 0)  ?  (UV)(Q + (IV)n)  :  (UV)Q;
+  Pmod = (P < 0)  ?  (UV)(P + (IV)n)  :  (UV)P;
+  Dmod = submod( mulmod(Pmod, Pmod, n), mulmod(4, Qmod, n), n);
+  MPUassert(Dmod != 0, "lucas_seq: D is 0");
+  U = 1;
+  V = Pmod;
+  Qk = Qmod;
+  { UV v = k; b = 1; while (v >>= 1) b++; }
+
+  if (Q == 1) {
+    while (b > 1) {
+      U = mulmod(U, V, n);
+      V = mulsubmod(V, V, 2, n);
+      b--;
+      if ( (k >> (b-1)) & UVCONST(1) ) {
+        UV t2 = mulmod(U, Dmod, n);
+        U = muladdmod(U, Pmod, V, n);
+        if (U & 1) { U = (n>>1) + (U>>1) + 1; } else { U >>= 1; }
+        V = muladdmod(V, P, t2, n);
+        if (V & 1) { V = (n>>1) + (V>>1) + 1; } else { V >>= 1; }
+      }
+    }
+  } else if (P == 1 && Q == -1) {
+    /* This is about 30% faster than the generic code below.  Since 50% of
+     * Lucas and strong Lucas tests come here, I think it's worth doing. */
+    int sign = Q;
+    while (b > 1) {
+      U = mulmod(U, V, n);
+      if (sign == 1) V = mulsubmod(V, V, 2, n);
+      else           V = muladdmod(V, V, 2, n);
+      sign = 1;   /* Qk *= Qk */
+      b--;
+      if ( (k >> (b-1)) & UVCONST(1) ) {
+        UV t2 = mulmod(U, Dmod, n);
+        U = addmod(U, V, n);
+        if (U & 1) { U = (n>>1) + (U>>1) + 1; } else { U >>= 1; }
+        V = addmod(V, t2, n);
+        if (V & 1) { V = (n>>1) + (V>>1) + 1; } else { V >>= 1; }
+        sign = -1;  /* Qk *= Q */
+      }
+    }
+    if (sign == 1) Qk = 1;
+  } else {
+    while (b > 1) {
+      U = mulmod(U, V, n);
+      V = mulsubmod(V, V, addmod(Qk,Qk,n), n);
+      Qk = sqrmod(Qk, n);
+      b--;
+      if ( (k >> (b-1)) & UVCONST(1) ) {
+        UV t2 = mulmod(U, Dmod, n);
+        U = muladdmod(U, Pmod, V, n);
+        if (U & 1) { U = (n>>1) + (U>>1) + 1; } else { U >>= 1; }
+        V = muladdmod(V, P, t2, n);
+        if (V & 1) { V = (n>>1) + (V>>1) + 1; } else { V >>= 1; }
+        Qk = mulmod(Qk, Qmod, n);
+      }
+    }
+  }
+  *Uret = U;
+  *Vret = V;
+  *Qkret = Qk;
+}
+
+/* Lucas tests:
+ *  0: Standard
+ *  1: Strong
+ *  2: Extra Strong (Mo/Jones/Grantham)
+ *
+ * None of them have any false positives for the BPSW test.  Also see the
+ * "almost extra strong" test.
+ */
+int _XS_is_lucas_pseudoprime(UV n, int strength)
+{
+  IV P, Q, D;
+  UV U, V, Qk, d, s;
+
+  if (n == 2 || n == 3) return 1;
+  if (n < 5 || (n%2) == 0) return 0;
+  if (n == UV_MAX) return 0;
+
+  if (strength < 2) {
+    UV Du = 5;
+    IV sign = 1;
+    while (1) {
+      D = Du * sign;
+      if (gcd_ui(Du, n) > 1 && gcd_ui(Du, n) != n) return 0;
+      if (jacobi_iu(D, n) == -1)
+        break;
+      if (Du == 21 && is_perfect_square(n, 0)) return 0;
+      Du += 2;
+      sign = -sign;
+    }
+    P = 1;
+    Q = (1 - D) / 4;
+  } else {
+    P = 3;
+    Q = 1;
+    while (1) {
+      D = P*P - 4;
+      if (gcd_ui(D, n) > 1 && gcd_ui(D, n) != n) return 0;
+      if (jacobi_iu(D, n) == -1)
+        break;
+      if (P == 21 && is_perfect_square(n, 0)) return 0;
+      P++;
+    }
+  }
+  MPUassert( D == (P*P - 4*Q) , "is_lucas_pseudoprime: incorrect DPQ");
+
+  d = n+1;
+  s = 0;
+  if (strength > 0)
+    while ( (d & 1) == 0 ) {  s++;  d >>= 1; }
+  lucas_seq(&U, &V, &Qk, n, P, Q, d);
+
+  if (strength == 0) {
+    if (U == 0)
+      return 1;
+  } else if (strength == 1) {
+    if (U == 0)
+      return 1;
+    /* Now check to see if V_{d*2^r} == 0 for any 0 <= r < s */
+    while (s--) {
+      if (V == 0)
+        return 1;
+      if (s) {
+        V = mulsubmod(V, V, addmod(Qk,Qk,n), n);
+        Qk = sqrmod(Qk, n);
+      }
+    }
+  } else {
+    if ( U == 0 && (V == 2 || V == (n-2)) )
+      return 1;
+    /* Now check to see if V_{d*2^r} == 0 for any 0 <= r < s-1 */
+    s--;
+    while (s--) {
+      if (V == 0)
+        return 1;
+      if (s)
+        V = mulsubmod(V, V, 2, n);
+    }
+  }
+  return 0;
+}
+
+/* A generalization of Pari's shortcut to the extra-strong Lucas test.
+ * I've added a gcd check at the top, which needs to be done and also results
+ * in fewer pseudoprimes.  Pari always does trial division to 100 first so
+ * is unlikely to come up there.  This only calculate V, which can be done
+ * faster, but that means we have more pseudoprimes than the standard
+ * extra-strong test.
+ *
+ * increment:  1 for Baillie OEIS, 2 for Pari.
+ *
+ * With increment = 1, these results will be a subset of the extra-strong
+ * Lucas pseudoprimes.  With increment = 2, we produce Pari's results.
+ */
+int _XS_is_almost_extra_strong_lucas_pseudoprime(UV n, UV increment)
+{
+  UV P, V, d, s;
+
+  if (n == 2 || n == 3 || n == 5) return 1;
+  if (n < 7 || (n%2) == 0) return 0;
+  if (n == UV_MAX) return 0;
+  if (increment < 1 || increment > 256)
+    croak("Invalid lucas paramater increment: %"UVuf"\n", increment);
+
+  P = 3;
+  while (1) {
+    UV D = P*P - 4;
+    d = gcd_ui(D, n);
+    if (d > 1 && d < n)
+      return 0;
+    if (jacobi_iu(D, n) == -1)
+      break;
+    if (P == (3+20*increment) && is_perfect_square(n, 0)) return 0;
+    P += increment;
+    if (P > 65535)
+      croak("lucas_extrastrong_params: P exceeded 65535");
+  }
+  if (P >= n)  P %= n;   /* Never happens with increment < 4 */
+
+  d = n+1;
+  s = 0;
+  while ( (d & 1) == 0 ) {  s++;  d >>= 1; }
+
+  {
+    UV W, b;
+    V = P;
+    W = mulsubmod(P, P, 2, n);
+    { UV v = d; b = 1; while (v >>= 1) b++; }
+    while (b-- > 1) {
+      if ( (d >> (b-1)) & UVCONST(1) ) {
+        V = mulsubmod(V, W, P, n);
+        W = mulsubmod(W, W, 2, n);
+      } else {
+        W = mulsubmod(V, W, P, n);
+        V = mulsubmod(V, V, 2, n);
+      }
+    }
+  }
+  if (V == 2 || V == (n-2))
+    return 1;
+  while (s-- > 1) {
+    if (V == 0)
+      return 1;
+    V = mulsubmod(V, V, 2, n);
+    if (V == 2)
+      return 0;
+  }
+  return 0;
+}
+
+/*
+ * The Frobenius-Underwood test has no known counterexamples below 10^13, but
+ * has not been extensively tested above that.  This is the Minimal Lambda+2
+ * test from section 9 of "Quadratic Composite Tests" by Paul Underwood.
+ *
+ * It is generally slower than the AES Lucas test, but for large values will
+ * be faster than a BPSW test.  Currently it is of use mainly for numbers
+ * larger than 2^64 as an additional non-correlated test.
+ *
+ * TODO: Convert to Montgomery
+ */
+int _XS_is_frobenius_underwood_pseudoprime(UV n)
+{
+  int bit;
+  UV x, result, multiplier, a, b, np1, len, t1, t2, na;
+  IV t;
+
+  if (n < 2) return 0;
+  if (n < 4) return 1;
+  if ((n % 2) == 0) return 0;
+  if (is_perfect_square(n,0)) return 0;
+  if (n == UV_MAX) return 0;
+
+  x = 0;
+  t = -1;
+  while ( jacobi_iu( t, n ) != -1 ) {
+    x++;
+    t = (IV)(x*x) - 4;
+  }
+  result = addmod( addmod(x, x, n), 5, n);
+  multiplier = addmod(x, 2, n);
+
+  a = 1;
+  b = 2;
+  np1 = n+1;
+  { UV v = np1; len = 1;  while (v >>= 1) len++; }
+
+  if (x == 0) {
+    for (bit = len-2; bit >= 0; bit--) {
+      t2 = addmod(b, b, n);
+      na = mulmod(a, t2, n);
+      t1 = addmod(b, a, n);
+      t2 = submod(b, a, n);
+      b = mulmod(t1, t2, n);
+      a = na;
+      if ( (np1 >> bit) & UVCONST(1) ) {
+        t1 = mulmod(a, 2, n);
+        na = addmod(t1, b, n);
+        t1 = addmod(b, b, n);
+        b = submod(t1, a, n);
+        a = na;
+      }
+    }
+  } else {
+    for (bit = len-2; bit >= 0; bit--) {
+      t1 = mulmod(a, x, n);
+      t2 = addmod(b, b, n);
+      t1 = addmod(t1, t2, n);
+      na = mulmod(a, t1, n);
+      t1 = addmod(b, a, n);
+      t2 = submod(b, a, n);
+      b = mulmod(t1, t2, n);
+      a = na;
+      if ( (np1 >> bit) & UVCONST(1) ) {
+        t1 = mulmod(a, multiplier, n);
+        na = addmod(t1, b, n);
+        t1 = addmod(b, b, n);
+        b = submod(t1, a, n);
+        a = na;
+      }
+    }
+  }
+  if (_XS_get_verbose()>1) printf("%"UVuf" is %s with x = %"UVuf"\n", n, (a == 0 && b == result) ? "probably prime" : "composite", x);
+  if (a == 0 && b == result)
+    return 1;
+  return 0;
+}
+
+
+/******************************************************************************/
+
+
+/* Select M-R bases from http://miller-rabin.appspot.com/, 26 July 2013 */
+#if BITS_PER_WORD == 32
+static const UV mr_bases_small_2[2] = {31, 73};
+static const UV mr_bases_small_3[3] = {2, 7, 61};
+#else
+static const UV mr_bases_large_1[1] = { UVCONST(  9345883071009581737 ) };
+static const UV mr_bases_large_2[2] = { UVCONST(         336781006125 ),
+                                        UVCONST(     9639812373923155 ) };
+static const UV mr_bases_large_3[3] = { UVCONST(  4230279247111683200 ),
+                                        UVCONST( 14694767155120705706 ),
+                                        UVCONST( 16641139526367750375 ) };
+static const UV mr_bases_large_7[7] = { 2, 325, 9375, 28178, 450775, 9780504, 1795265022 };
+#endif
+
+int _XS_is_prob_prime(UV n)
+{
+  int ret;
+
+  if (n < 11) {
+    if (n == 2 || n == 3 || n == 5 || n == 7)     return 2;
+    else                                          return 0;
+  }
+  if (!(n%2) || !(n%3) || !(n%5) || !(n%7))       return 0;
+  if (n <  121) /* 11*11 */                       return 2;
+  if (!(n%11) || !(n%13) || !(n%17) || !(n%19) ||
+      !(n%23) || !(n%29) || !(n%31) || !(n%37) ||
+      !(n%41) || !(n%43) || !(n%47) || !(n%53))   return 0;
+  if (n < 3481) /* 59*59 */                       return 2;
+
+#if BITS_PER_WORD == 32
+  /* We could use one base when n < 49191, two when n < 360018361. */
+  if (n < UVCONST(9080191))
+    ret = _XS_miller_rabin(n, mr_bases_small_2, 2);
+  else
+    ret = _XS_miller_rabin(n, mr_bases_small_3, 3);
+#else
+  /* AESLSP test costs about 1.5 Selfridges, vs. ~2.2 for strong Lucas.
+   * So it works out to be faster to do AES-BPSW vs. 3 M-R tests. */
+  if (n < UVCONST(341531))
+    ret = _XS_miller_rabin(n, mr_bases_large_1, 1);
+  else if (n < UVCONST(1050535501))
+    ret = _XS_miller_rabin(n, mr_bases_large_2, 2);
+  else
+    ret = _XS_BPSW(n);
+  /*
+    ret = efficient_mr64(mr_bases_large_7, 7, n);
+    ret = _XS_miller_rabin(n, mr_bases_large_7, 7);
+  */
+#endif
+  return 2*ret;
+}
diff --git a/primality.h b/primality.h
new file mode 100644
index 0000000..ec7e0f9
--- /dev/null
+++ b/primality.h
@@ -0,0 +1,22 @@
+#ifndef MPU_PRIMALITY_H
+#define MPU_PRIMALITY_H
+
+#include "ptypes.h"
+
+#if BITS_PER_WORD == 64 && HAVE_STD_U64 && defined(__GNUC__) && defined(__x86_64__)
+#define USE_MONT_PRIMALITY 1
+#else
+#define USE_MONT_PRIMALITY 0
+#endif
+
+extern int _XS_is_pseudoprime(UV const n, UV a);
+extern int _XS_miller_rabin(UV const n, const UV *bases, int nbases);
+extern void lucas_seq(UV* U, UV* V, UV* Qk,  UV n, IV P, IV Q, UV k);
+extern int _XS_is_lucas_pseudoprime(UV n, int strength);
+extern int _XS_is_frobenius_underwood_pseudoprime(UV n);
+extern int _XS_is_almost_extra_strong_lucas_pseudoprime(UV n, UV increment);
+
+extern int _XS_BPSW(UV const n);
+extern int _XS_is_prob_prime(UV n);
+
+#endif
diff --git a/util.c b/util.c
index 523ce65..b4436af 100644
--- a/util.c
+++ b/util.c
@@ -58,7 +58,7 @@
 #include "ptypes.h"
 #include "util.h"
 #include "sieve.h"
-#include "factor.h"
+#include "primality.h"
 #include "cache.h"
 #include "lehmer.h"
 
diff --git a/util.h b/util.h
index 4194879..efc4107 100644
--- a/util.h
+++ b/util.h
@@ -47,4 +47,15 @@ static UV isqrt(UV n) {
   return root;
 }
 
+#ifdef FUNC_gcd_ui
+static UV gcd_ui(UV x, UV y) {
+  UV t;
+  if (y < x) { t = x; x = y; y = t; }
+  while (y > 0) {
+    t = y;  y = x % y;  x = t;  /* y1 <- x0 % y0 ; x1 <- y0 */
+  }
+  return x;
+}
+#endif
+
 #endif
diff --git a/xt/primality-small.pl b/xt/primality-small.pl
index 70d5900..98491f4 100755
--- a/xt/primality-small.pl
+++ b/xt/primality-small.pl
@@ -5,7 +5,7 @@ $| = 1;  # fast pipes
 
 # Make sure the is_prob_prime functionality is working for small inputs.
 # Good for making sure the first few M-R bases are set up correctly.
-my $limit = 1_000_000_000;
+my $limit = shift || 1_000_000_000;
 
 use Math::Prime::Util qw/is_prob_prime/;
 # Use another code base for comparison.

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git