[libmath-prime-util-perl] 115/181: Removed old SQUFOF code. Faster is_perfect_square. Streamline trial division pre-factoring.

Thu May 21 18:51:12 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.36
in repository libmath-prime-util-perl.

commit 5be203368d0f96cf19210e3697df459ce62ace9a
Author: Dana Jacobsen <dana at acm.org>
Date:   Sun Jan 5 00:19:58 2014 -0800

    Removed old SQUFOF code.  Faster is_perfect_square.  Streamline trial division pre-factoring.
---
 Changes                            |   3 +
 XS.xs                              |  23 +--
 examples/bench-factor-extra.pl     |   2 -
 examples/bench-factor-semiprime.pl |   8 +-
 examples/test-factor-gnufactor.pl  |  13 +-
 factor.c                           | 375 ++++++++++---------------------------
 factor.h                           |   1 -
 lib/Math/Prime/Util.pm             |   8 +-
 primality.c                        |  24 +--
 t/50-factoring.t                   |   3 +-
 util.h                             |  20 +-
 11 files changed, 144 insertions(+), 336 deletions(-)

diff --git a/Changes b/Changes
index 4712ef8..050084c 100644
--- a/Changes
+++ b/Changes
@@ -63,6 +63,9 @@ Revision history for Perl module Math::Prime::Util
     - While Math::BigInt has the bgcd function, it is slow for native numbers,
       even with the Pari or GMP back ends.  The gcd here is 20-100x faster.
 
+    - Removed the old SQUFOF code, so the racing version is the only one.  It
+      was already the only one being used.
+
 
 0.35  2013-12-08
 
diff --git a/XS.xs b/XS.xs
index 2cae867..c6f7254 100644
--- a/XS.xs
+++ b/XS.xs
@@ -361,11 +361,10 @@ trial_factor(IN UV n, ...)
     fermat_factor = 1
     holf_factor = 2
     squfof_factor = 3
-    rsqufof_factor = 4
-    pbrent_factor = 5
-    prho_factor = 6
-    pplus1_factor = 7
-    pminus1_factor = 8
+    pbrent_factor = 4
+    prho_factor = 5
+    pplus1_factor = 6
+    pminus1_factor = 7
   PPCODE:
     if (n == 0)  XSRETURN_UV(0);
     while ( (n% 2) == 0 ) {  n /=  2;  XPUSHs(sv_2mortal(newSVuv( 2 ))); }
@@ -386,19 +385,16 @@ trial_factor(IN UV n, ...)
         case 3:  arg1 = (items < 2)  ?   4*1024*1024  : SvUV(ST(1));
                  nfactors = squfof_factor (n, factors, arg1);  break;
         case 4:  arg1 = (items < 2)  ?   4*1024*1024  : SvUV(ST(1));
-                 nfactors = racing_squfof_factor(n, factors, arg1);  break;
-        case 5:  arg1 = (items < 2)  ?   4*1024*1024  : SvUV(ST(1));
                  arg2 = (items < 3)  ?             1  : SvUV(ST(2));
                  nfactors = pbrent_factor (n, factors, arg1, arg2);  break;
-        case 6:  arg1 = (items < 2)  ?   4*1024*1024  : SvUV(ST(1));
+        case 5:  arg1 = (items < 2)  ?   4*1024*1024  : SvUV(ST(1));
                  nfactors = prho_factor   (n, factors, arg1);  break;
-        case 7:  arg1 = (items < 2)  ?           200  : SvUV(ST(1));
+        case 6:  arg1 = (items < 2)  ?           200  : SvUV(ST(1));
                  nfactors = pplus1_factor (n, factors, arg1);  break;
-        case 8:  arg1 = (items < 2)  ?   1*1024*1024  : SvUV(ST(1));
-                 arg2 = (items < 3)  ?             0  : SvUV(ST(2));
-                 if (arg2 == 0) arg2 = 10*arg1;  /* default B2 */
+        case 7:
+        default: arg1 = (items < 2)  ?   1*1024*1024  : SvUV(ST(1));
+                 arg2 = (items < 3)  ?       10*arg1  : SvUV(ST(2));
                  nfactors = pminus1_factor(n, factors, arg1, arg2);  break;
-        default: break;
       }
       EXTEND(SP, nfactors);
       for (i = 0; i < nfactors; i++)
@@ -606,7 +602,6 @@ factor(IN SV* svn)
       return; /* skip implicit PUTBACK */
     }
 
-
 void
 divisor_sum(IN SV* svn, ...)
   PREINIT:
diff --git a/examples/bench-factor-extra.pl b/examples/bench-factor-extra.pl
index 674ce19..d5726d8 100755
--- a/examples/bench-factor-extra.pl
+++ b/examples/bench-factor-extra.pl
@@ -58,7 +58,6 @@ sub test_at_digits {
     $nfactored{'pminus1'} += $calc_nfacs->(Math::Prime::Util::pminus1_factor($_, $p1smooth));
     $nfactored{'pplus1'} += $calc_nfacs->(Math::Prime::Util::pplus1_factor($_, $p1smooth));
     $nfactored{'squfof'} += $calc_nfacs->(Math::Prime::Util::squfof_factor($_, $sqrounds));
-    $nfactored{'rsqufof'} += $calc_nfacs->(Math::Prime::Util::rsqufof_factor($_, $rsqrounds));
     #$nfactored{'trial'} += $calc_nfacs->(Math::Prime::Util::trial_factor($_));
     #$nfactored{'fermat'} += $calc_nfacs->(Math::Prime::Util::fermat_factor($_, $rounds));
     $nfactored{'holf'} += $calc_nfacs->(Math::Prime::Util::holf_factor($_, $hrounds));
@@ -79,7 +78,6 @@ sub test_at_digits {
     "fermat"  => sub { Math::Prime::Util::fermat_factor($_, $rounds) for @nums},
     "holf"    => sub { Math::Prime::Util::holf_factor($_, $hrounds) for @nums },
     "squfof"  => sub { Math::Prime::Util::squfof_factor($_, $sqrounds) for @nums },
-    "rsqufof" => sub { Math::Prime::Util::rsqufof_factor($_) for @nums },
     "trial"   => sub { Math::Prime::Util::trial_factor($_) for @nums },
   };
   delete $lref->{'fermat'} if $digits >= 9;
diff --git a/examples/bench-factor-semiprime.pl b/examples/bench-factor-semiprime.pl
index b9973f0..53b73ed 100755
--- a/examples/bench-factor-semiprime.pl
+++ b/examples/bench-factor-semiprime.pl
@@ -4,7 +4,7 @@ use warnings;
 $| = 1;  # fast pipes
 srand(377);
 
-use Math::Prime::Util qw/factor -nobigint/;
+use Math::Prime::Util qw/factor/;
 use Math::Factor::XS qw/prime_factors/;
 use Math::Pari qw/factorint/;
 use Benchmark qw/:all/;
@@ -93,9 +93,9 @@ foreach my $sp (@semiprimes) {
 print "OK\n";
 
 my %compare = (
-    'MPU'   => sub { factor($_) for @semiprimes; },
-    'MFXS'  => sub { prime_factors($_) for @semiprimes; },
-    'Pari'  => sub { foreach my $n (@semiprimes) { my @factors; my ($pn,$pc) = @{factorint($n)}; push @factors, (int($pn->[$_])) x $pc->[$_] for (0 .. $#{$pn}); } }
+    'MPU'   => sub { do { my @f = factor($_) } for @semiprimes; },
+    'MFXS'  => sub { do { my @f = prime_factors($_) } for @semiprimes; },
+    'Pari'  => sub { do { my ($pn,$pc) = @{factorint($_)}; my @f = map { int($pn->[$_]) x $pc->[$_] } 0 .. $#$pn; } for @semiprimes; },
 );
 delete $compare{'MFXS'} if $skip_mfxs;
 
diff --git a/examples/test-factor-gnufactor.pl b/examples/test-factor-gnufactor.pl
index 4546910..2e3b43e 100755
--- a/examples/test-factor-gnufactor.pl
+++ b/examples/test-factor-gnufactor.pl
@@ -25,15 +25,16 @@ my $num = 1000;
 # large numbers.  You'll probably want to turn it off here as it will be
 # many thousands of times slower than MPU and Pari.
 
-# A performance note: MPU and Pari get their results by calling a function.
-# GNU factor gets its result by multiple shells out to /usr/bin/factor with
-# the numbers as command line arguments.  This adds a lot of overhead that
-# has nothing to do with their implementation.  For comparison, try turning
-# on the MPU factor.pl script, and weep at Perl's startup cost.
+# A benchmarking note: in this script, getting MPU and Pari results are done
+# by calling a function, where getting GNU factor results are done via
+# multiple shells to /usr/bin/factor with the inputs as command line
+# arguments.  This adds a lot of overhead that has nothing to do with their
+# implementation.  For comparison, I've included an option for getting MPU
+# factoring via calling the factor.pl script.  Weep at the startup cost.
 
 my $do_gnu = 1;
 my $do_pari = 1;
-my $use_mpu_factor_script = 0;
+my $use_mpu_factor_script = 1;
 
 my $rgen = sub {
   my $range = shift;
diff --git a/factor.c b/factor.c
index 092a2c9..ed61fea 100644
--- a/factor.c
+++ b/factor.c
@@ -11,8 +11,12 @@
 #include "primality.h"
 #define FUNC_isqrt  1
 #define FUNC_gcd_ui 1
+#define FUNC_is_perfect_square 1
 #include "util.h"
 
+/* factor will do trial division through this prime number, must be in table */
+#define TRIAL_TO_PRIME 81
+
 /*
  * You need to remember to use UV for unsigned and IV for signed types that
  * are large enough to hold our data.
@@ -24,6 +28,29 @@
  * match the native integer type used inside our Perl, so just use those.
  */
 
+static const unsigned short primes_small[] =
+  {0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
+   101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
+   193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
+   293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
+   409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,
+   521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,
+   641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,739,743,751,
+   757,761,769,773,787,797,809,811,821,823,827,829,839,853,857,859,863,877,
+   881,883,887,907,911,919,929,937,941,947,953,967,971,977,983,991,997,1009,
+   1013,1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,1087,1091,1093,
+   1097,1103,1109,1117,1123,1129,1151,1153,1163,1171,1181,1187,1193,1201,
+   1213,1217,1223,1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,1297,
+   1301,1303,1307,1319,1321,1327,1361,1367,1373,1381,1399,1409,1423,1427,
+   1429,1433,1439,1447,1451,1453,1459,1471,1481,1483,1487,1489,1493,1499,
+   1511,1523,1531,1543,1549,1553,1559,1567,1571,1579,1583,1597,1601,1607,
+   1609,1613,1619,1621,1627,1637,1657,1663,1667,1669,1693,1697,1699,1709,
+   1721,1723,1733,1741,1747,1753,1759,1777,1783,1787,1789,1801,1811,1823,
+   1831,1847,1861,1867,1871,1873,1877,1879,1889,1901,1907,1913,1931,1933,
+   1949,1951,1973,1979,1987,1993,1997,1999,2003,2011};
+#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0]))
+
+
 /* The main factoring loop */
 /* Puts factors in factors[] and returns the number found. */
 int factor(UV n, UV *factors)
@@ -31,23 +58,41 @@ int factor(UV n, UV *factors)
   int nfactors = 0;           /* Number of factored in factors result */
 
   int const verbose = _XS_get_verbose();
-  UV const tlim_lower = 401;  /* Trial division through this prime */
-  UV const tlim = 409;        /* This means we've checked through here */
+  UV f;
   UV tofac_stack[MPU_MAX_FACTORS+1];
   UV fac_stack[MPU_MAX_FACTORS+1];
   int ntofac = 0;             /* Number of items on tofac_stack */
   int nfac = 0;               /* Number of items on fac_stack */
 
-  if (n < 10000000)
-    return trial_factor(n, factors, 0);
-
-  /* Trial division for all factors below tlim */
-  nfactors = trial_factor(n, factors, tlim_lower);
-  n = factors[--nfactors];
+  if (n < 4) {
+    factors[0] = n;
+    return (n == 1) ? 0 : 1;
+  }
+  while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n /= 2; }
+  while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
+  while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
+  f = 7;
+
+  if (f*f <= n) {
+    UV sp = 3;
+    while (++sp < TRIAL_TO_PRIME) {
+      f = primes_small[sp];
+      if (f*f > n) break;
+      while ( (n%f) == 0 ) {
+        factors[nfactors++] = f;
+        n /= f;
+      }
+    }
+  }
+  if (n < f*f) {
+    if (n != 1)
+      factors[nfactors++] = n;
+    return nfactors;
+  }
 
   /* loop over each remaining factor, until ntofac == 0 */
   do {
-    while ( (n >= (tlim*tlim)) && (!_XS_is_prime(n)) ) {
+    while ( (n >= f*f) && (!_XS_is_prime(n)) ) {
       int split_success = 0;
       /* Adjust the number of rounds based on the number size */
       UV const br_rounds = ((n>>29) < 100000) ?  1500 :  1500;
@@ -60,8 +105,8 @@ int factor(UV n, UV *factors)
       }
       /* SQUFOF with these parameters gets 99.9% of everything left */
       if (!split_success && n < (UV_MAX>>2)) {
-        split_success = racing_squfof_factor(n,tofac_stack+ntofac, sq_rounds)-1;
-        if (verbose) printf("rsqufof %d\n", split_success);
+        split_success = squfof_factor(n,tofac_stack+ntofac, sq_rounds)-1;
+        if (verbose) printf("squfof %d\n", split_success);
       }
       /* At this point we should only have 16+ digit semiprimes. */
       /* This p-1 gets about 2/3 of what makes it through the above */
@@ -89,8 +134,7 @@ int factor(UV n, UV *factors)
         n = tofac_stack[ntofac];  /* Set n to the other one */
       } else {
         /* Factor via trial division.  Nothing should make it here. */
-        UV f = tlim;
-        UV m = tlim % 30;
+        UV m = f % 30;
         UV limit = isqrt(n);
         if (verbose) printf("doing trial on %"UVuf"\n", n);
         while (f <= limit) {
@@ -132,6 +176,7 @@ int factor(UV n, UV *factors)
   return nfactors;
 }
 
+
 int factor_exp(UV n, UV *factors, UV* exponents)
 {
   int i, j, nfactors;
@@ -159,32 +204,8 @@ int factor_exp(UV n, UV *factors, UV* exponents)
 }
 
 
-
-static const unsigned short primes_small[] =
-  {0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
-   101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
-   193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
-   293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
-   409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,
-   521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,
-   641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,739,743,751,
-   757,761,769,773,787,797,809,811,821,823,827,829,839,853,857,859,863,877,
-   881,883,887,907,911,919,929,937,941,947,953,967,971,977,983,991,997,1009,
-   1013,1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,1087,1091,1093,
-   1097,1103,1109,1117,1123,1129,1151,1153,1163,1171,1181,1187,1193,1201,
-   1213,1217,1223,1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,1297,
-   1301,1303,1307,1319,1321,1327,1361,1367,1373,1381,1399,1409,1423,1427,
-   1429,1433,1439,1447,1451,1453,1459,1471,1481,1483,1487,1489,1493,1499,
-   1511,1523,1531,1543,1549,1553,1559,1567,1571,1579,1583,1597,1601,1607,
-   1609,1613,1619,1621,1627,1637,1657,1663,1667,1669,1693,1697,1699,1709,
-   1721,1723,1733,1741,1747,1753,1759,1777,1783,1787,1789,1801,1811,1823,
-   1831,1847,1861,1867,1871,1873,1877,1879,1889,1901,1907,1913,1931,1933,
-   1949,1951,1973,1979,1987,1993,1997,1999,2003,2011};
-#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0]))
-
 int trial_factor(UV n, UV *factors, UV maxtrial)
 {
-  UV f, limit, newlimit;
   int nfactors = 0;
 
   if (maxtrial == 0)  maxtrial = UV_MAX;
@@ -194,56 +215,38 @@ int trial_factor(UV n, UV *factors, UV maxtrial)
     factors[0] = n;
     return (n == 1) ? 0 : 1;
   }
-  /* Trial division for 2, 3, 5, 7, and see if we're done */
+  /* Trial division for 2, 3, 5 immediately */
   while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n /= 2; }
   if (3<=maxtrial) while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
   if (5<=maxtrial) while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
-  if (7<=maxtrial) while ( (n % 7) == 0 ) { factors[nfactors++] = 7; n /= 7; }
-  f = 11;
-  if ( (n < (f*f)) || (maxtrial < f) ) {
-    if (n != 1)
-      factors[nfactors++] = n;
-    return nfactors;
-  }
-
-  /* Trial division to this number at most.  Reduced as we find factors. */
-  limit = isqrt(n);
-  if (limit > maxtrial)
-    limit = maxtrial;
 
-  /* Use the table of small primes to quickly do trial division. */
-  {
-    UV sp = 5;
-    UV slimit = (limit < 2003) ? limit : 2003;
-    f = primes_small[sp];
-    while (f <= slimit) {
-      if ( (n%f) == 0 ) {
-        do {
-          factors[nfactors++] = f;
-          n /= f;
-        } while ( (n%f) == 0 );
-        newlimit = isqrt(n);
-        if (newlimit < slimit)  slimit = newlimit;
-        if (newlimit < limit)   limit = newlimit;
+  if (7*7 <= n) {
+    UV f, sp = 3;
+    while (++sp < NPRIMES_SMALL) {
+      f = primes_small[sp];
+      if (f*f > n || f > maxtrial) break;
+      while ( (n%f) == 0 ) {
+        factors[nfactors++] = f;
+        n /= f;
       }
-      f = primes_small[++sp];
     }
-  }
-
-  /* Trial division using a mod-30 wheel for larger values */
-  if (f <= limit) {
-    UV m = f % 30;
-    while (f <= limit) {
-      if ( (n%f) == 0 ) {
-        do {
-          factors[nfactors++] = f;
-          n /= f;
-        } while ( (n%f) == 0 );
-        newlimit = isqrt(n);
-        if (newlimit < limit)  limit = newlimit;
+    /* Trial division using a mod-30 wheel for larger values */
+    if (f*f <= n && f <= maxtrial) {
+      UV newlimit, limit = isqrt(n);
+      if (limit > maxtrial) limit = maxtrial;
+      UV m = f % 30;
+      while (f <= limit) {
+        if ( (n%f) == 0 ) {
+          do {
+            factors[nfactors++] = f;
+            n /= f;
+          } while ( (n%f) == 0 );
+          newlimit = isqrt(n);
+          if (newlimit < limit)  limit = newlimit;
+        }
+        f += wheeladvance30[m];
+        m = nextwheel30[m];
       }
-      f += wheeladvance30[m];
-      m = nextwheel30[m];
     }
   }
   /* All done! */
@@ -253,74 +256,6 @@ int trial_factor(UV n, UV *factors, UV maxtrial)
 }
 
 
-/* Return 0 if n is not a perfect square.  Set sqrtn to int(sqrt(n)) if so.
- *
- * Some simple solutions:
- *
- *     return ( ((n&2)!= 0) || ((n&7)==5) || ((n&11) == 8) )  ?  0  :  1;
- *
- * or:
- *
- *     m = n & 31;
- *     if ( m==0 || m==1 || m==4 || m==9 || m==16 || m==17 || m==25 )
- *       ...test for perfect square...
- *
- * or:
- *
- *     if (  ((0x0202021202030213ULL >> (n & 63)) & 1) &&
- *           ((0x0402483012450293ULL >> (n % 63)) & 1) &&
- *           ((0x218a019866014613ULL >> ((n % 65) & 63)) & 1) &&
- *           ((0x23b                 >> (n % 11) & 1)) ) {
- *
- *
- * The following Bloom filter cascade works very well indeed.  Read all
- * about it here: http://mersenneforum.org/showpost.php?p=110896
- */
-static int is_perfect_square(UV n, UV* sqrtn)
-{
-  UV m;
-  m = n & 127;
-  if ((m*0x8bc40d7d) & (m*0xa1e2f5d1) & 0x14020a)  return 0;
-  /* 82% of non-squares rejected here */
-
-#if 0
-  /* The big deal with this technique is that you do two total operations,
-   * one cheap (the & 127 above), one expensive (the modulo below) on n.
-   * The rest of the operations are 32-bit operations.  This is a huge win
-   * if n is multiprecision.
-   * However, in this file we're doing native precision sqrt, so it just
-   * isn't expensive enough to justify this second filter set.
-   */
-  lm = n % UVCONST(63*25*11*17*19*23*31);
-  m = lm % 63;
-  if ((m*0x3d491df7) & (m*0xc824a9f9) & 0x10f14008) return 0;
-  m = lm % 25;
-  if ((m*0x1929fc1b) & (m*0x4c9ea3b2) & 0x51001005) return 0;
-  m = 0xd10d829a*(lm%31);
-  if (m & (m+0x672a5354) & 0x21025115) return 0;
-  m = lm % 23;
-  if ((m*0x7bd28629) & (m*0xe7180889) & 0xf8300) return 0;
-  m = lm % 19;
-  if ((m*0x1b8bead3) & (m*0x4d75a124) & 0x4280082b) return 0;
-  m = lm % 17;
-  if ((m*0x6736f323) & (m*0x9b1d499) & 0xc0000300) return 0;
-  m = lm % 11;
-  if ((m*0xabf1a3a7) & (m*0x2612bf93) & 0x45854000) return 0;
-  /* 99.92% of non-squares are rejected now */
-#endif
-#if 0
-  /* This could save time on some platforms, but not on x86 */
-  m = n % 63;
-  if ((m*0x3d491df7) & (m*0xc824a9f9) & 0x10f14008) return 0;
-#endif
-  m = isqrt(n);
-  if (n != (m*m))
-    return 0;
-
-  if (sqrtn != 0) *sqrtn = m;
-  return 1;
-}
-
 static int _divisors_from_factors(UV v, UV npe, UV* fp, UV* fe, UV* res) {
   UV p, e, i;
   if (npe == 0) return 0;
@@ -401,9 +336,9 @@ UV divisor_sum(UV n, UV k)
   UV product = 1;
 
   if (k > 5 || (k > 0 && n >= sigma_overflow[k-1])) return 0;
-  if (n == 0) return (k == 0) ? 2 : 1;  /* divisors are [0,1] */
-  if (n == 1) return 1;                 /* divisors are [1]   */
-  nfac = factor(n, factors);
+  if (n <= 1)                               /* n=0  divisors are [0,1] */
+    return (n == 1) ? 1 : (k == 0) ? 2 : 1; /* n=1  divisors are [1]   */
+  nfac = factor(n,factors);
   if (k == 0) {
     for (i = 0; i < nfac; i++) {
       UV e = 1,  f = factors[i];
@@ -491,7 +426,8 @@ int holf_factor(UV n, UV *factors, UV rounds)
      * so we won't be able to accurately detect it anyway. */
     s++;    /* s = ceil(sqrt(n*i)) */
     m = sqrmod(s, n);
-    if (is_perfect_square(m, &f)) {
+    if (is_perfect_square(m)) {
+      f = isqrt(m);
       f = gcd_ui( (s>f) ? s-f : f-s, n);
       /* This should always succeed, but with overflow concerns.... */
       if ((f == 1) || (f == n))
@@ -799,131 +735,7 @@ int pplus1_factor(UV n, UV *factors, UV B1)
 }
 
 
-/* My modification of Ben Buhrow's modification of Bob Silverman's SQUFOF code.
- */
-
-int squfof_factor(UV n, UV *factors, UV rounds)
-{
-  IV qqueue[100+1];
-  IV qpoint;
-  IV rounds2 = (IV) (rounds/16);
-  UV temp;
-  IV iq,ll,l2,p,pnext,q,qlast,r,s,t,i;
-  IV jter, iter;
-  int reloop;
-
-  MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in squfof_factor");
-
-  /* TODO:  What value of n leads to overflow? */
-
-  qlast = 1;
-  s = isqrt(n);
-
-  p = s;
-  temp = n - (s*s);                 /* temp = n - floor(sqrt(n))^2   */
-  if (temp == 0) {
-    factors[0] = s;
-    factors[1] = s;
-    return 2;
-  }
-
-  q = temp;              /* q = excess of n over next smaller square */
-  ll = 1 + 2*(IV)sqrt((double)(p+p));
-  l2 = ll/2;
-  qpoint = 0;
-
-  /*  In the loop below, we need to check if q is a square right before   */
-  /*  the end of the loop.  Is there a faster way? The current way is     */
-  /*  EXPENSIVE! (many branches and double prec sqrt)                     */
-
-  for (jter=0; (UV)jter < rounds; jter++) {
-    iq = (s + p)/q;
-    pnext = iq*q - p;
-    if (q <= ll) {
-      if ((q & 1) == 0) { qqueue[qpoint] = q/2; if (++qpoint>=100) jter = -1; }
-      else if (q <= l2) { qqueue[qpoint] = q;   if (++qpoint>=100) jter = -1; }
-      if (jter < 0) {
-        factors[0] = n;  return 1;
-      }
-    }
-
-    t = qlast + iq*(p - pnext);
-    qlast = q;
-    q = t;
-    p = pnext;                          /* check for square; even iter   */
-    if (jter & 1) continue;             /* jter is odd:omit square test  */
-    r = isqrt(q);                       /* r = floor(sqrt(q))      */
-    if (q != r*r) continue;
-    if (qpoint == 0) break;
-    qqueue[qpoint] = 0;
-    reloop = 0;
-    for (i=0; i<qpoint-1; i+=2) {    /* treat queue as list for simplicity*/
-      if (r == qqueue[i]) { reloop = 1; break; }
-      if (r == qqueue[i+1]) { reloop = 1; break; }
-    }
-    if (reloop || (r == qqueue[qpoint-1])) continue;
-    break;
-  }   /* end of main loop */
-
-  if ((UV)jter >= rounds) {
-    factors[0] = n;  return 1;
-  }
-
-  qlast = r;
-  p = p + r*((s - p)/r);
-  q = (n - (p*p)) / qlast;               /* q = (n - p*p)/qlast (div is exact)*/
-  for (iter=0; iter<rounds2; iter++) {   /* unrolled second main loop */
-    iq = (s + p)/q;
-    pnext = iq*q - p;
-    if (p == pnext) break;
-    t = qlast + iq*(p - pnext);
-    qlast = q;
-    q = t;
-    p = pnext;
-    iq = (s + p)/q;
-    pnext = iq*q - p;
-    if (p == pnext) break;
-    t = qlast + iq*(p - pnext);
-    qlast = q;
-    q = t;
-    p = pnext;
-    iq = (s + p)/q;
-    pnext = iq*q - p;
-    if (p == pnext) break;
-    t = qlast + iq*(p - pnext);
-    qlast = q;
-    q = t;
-    p = pnext;
-    iq = (s + p)/q;
-    pnext = iq*q - p;
-    if (p == pnext) break;
-    t = qlast + iq*(p - pnext);
-    qlast = q;
-    q = t;
-    p = pnext;
-  }
-
-  if (iter >= rounds2) {
-    factors[0] = n;  return 1;
-  }
-
-  if ((q & 1) == 0) q/=2;      /* q was factor or 2*factor   */
-
-  if ( (q == 1) || ((UV)q == n) ) {
-    factors[0] = n;  return 1;
-  }
-
-  p = n/q;
-
-  /* printf(" squfof found %lu = %lu * %lu in %ld/%ld rounds\n", n, p, q, jter, iter); */
-
-  factors[0] = p;
-  factors[1] = q;
-  MPUassert( factors[0] * factors[1] == n , "incorrect factoring");
-  return 2;
-}
-
-/* Another version, based on Ben Buhrow's racing SQUFOF. */
+/* SQUFOF, based on Ben Buhrow's racing version. */
 
 typedef struct
 {
@@ -985,15 +797,16 @@ static void squfof_unit(UV n, mult_t* mult_save, UV* f)
       SQUARE_SEARCH_ITERATION;
 
       /* Even iteration.  Check for square: Qn = S*S */
-      if (is_perfect_square( Qn, &S ))
+      if (is_perfect_square(Qn))
         break;
 
       /* Odd iteration. */
       SQUARE_SEARCH_ITERATION;
     }
+    S = isqrt(Qn);
     /* printf("found square %lu after %lu iterations with mult %d\n", Qn, i, mult_save->mult); */
 
-    /*  Reduce to G0 */
+    /* Reduce to G0 */
     Ro = P + S*((b0 - P)/S);
     t1 = Ro;
     So = (n - t1*t1)/S;
@@ -1033,7 +846,7 @@ static const UV squfof_multipliers[] =
     3*11,     3,     5*11,   5,   7*11,   7,   11,     1     };
 #define NSQUFOF_MULT (sizeof(squfof_multipliers)/sizeof(squfof_multipliers[0]))
 
-int racing_squfof_factor(UV n, UV *factors, UV rounds)
+int squfof_factor(UV n, UV *factors, UV rounds)
 {
   const UV big2 = UV_MAX >> 2;
   mult_t mult_save[NSQUFOF_MULT];
@@ -1042,7 +855,7 @@ int racing_squfof_factor(UV n, UV *factors, UV rounds)
   UV rounds_done = 0;
 
   /* Caller should have handled these trivial cases */
-  MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in racing_squfof_factor");
+  MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in squfof_factor");
 
   /* Too big */
   if (n > big2) {
diff --git a/factor.h b/factor.h
index 7ee466a..7c4682c 100644
--- a/factor.h
+++ b/factor.h
@@ -18,7 +18,6 @@ extern int prho_factor(UV n, UV *factors, UV maxrounds);
 extern int pminus1_factor(UV n, UV *factors, UV B1, UV B2);
 extern int pplus1_factor(UV n, UV *factors, UV B);
 extern int squfof_factor(UV n, UV *factors, UV rounds);
-extern int racing_squfof_factor(UV n, UV *factors, UV rounds);
 
 extern UV* _divisor_list(UV n, UV *num_divisors);
 
diff --git a/lib/Math/Prime/Util.pm b/lib/Math/Prime/Util.pm
index c8b410c..1018b31 100644
--- a/lib/Math/Prime/Util.pm
+++ b/lib/Math/Prime/Util.pm
@@ -136,7 +136,6 @@ BEGIN {
     *fermat_factor  = \&Math::Prime::Util::PP::fermat_factor;
     *holf_factor    = \&Math::Prime::Util::PP::holf_factor;
     *squfof_factor  = \&Math::Prime::Util::PP::squfof_factor;
-    *rsqufof_factor = \&Math::Prime::Util::PP::squfof_factor;
     *pbrent_factor  = \&Math::Prime::Util::PP::pbrent_factor;
     *prho_factor    = \&Math::Prime::Util::PP::prho_factor;
     *pminus1_factor = \&Math::Prime::Util::PP::pminus1_factor;
@@ -3863,10 +3862,7 @@ same advantages and disadvantages as Fermat's method.
 
 =head2 squfof_factor
 
-=head2 rsqufof_factor
-
   my @factors = squfof_factor($n);
-  my @factors = rsqufof_factor($n);  # racing multiplier version
 
 Produces factors, not necessarily prime, of the positive number input.  An
 optional number of rounds can be given as a second parameter.  It is possible
@@ -4799,9 +4795,7 @@ The current implementation does not use these ideas.  Future versions might.
 The SQUFOF implementation being used is a slight modification to the public
 domain racing version written by Ben Buhrow.  Enhancements with ideas from
 Ben's later code as well as Jason Papadopoulos's public domain implementations
-are planned for a later version.  The old SQUFOF implementation, still included
-in the code, is my modifications to Ben Buhrow's modifications to Bob
-Silverman's code.
+are planned for a later version.
 
 The LMO implementation is based on the 2003 preprint from Christian Bau,
 as well as the 2006 paper from Tomás Oliveira e Silva.  I also want to
diff --git a/primality.c b/primality.c
index 33880a5..5d0fdbf 100644
--- a/primality.c
+++ b/primality.c
@@ -8,6 +8,7 @@
 #include "mulmod.h"
 #define FUNC_isqrt  1
 #define FUNC_gcd_ui 1
+#define FUNC_is_perfect_square
 #include "util.h"
 
 /* Primality related functions, including Montgomery math */
@@ -148,19 +149,6 @@ static int monty_mr64(const uint64_t n, const UV* bases, int cnt)
 
 
 
-/* Helper functions */
-static int is_perfect_square(UV n, UV* sqrtn)
-{
-  UV m;
-  m = n & 127;
-  if ((m*0x8bc40d7d) & (m*0xa1e2f5d1) & 0x14020a)  return 0;
-  m = isqrt(n);
-  if (n != (m*m))
-    return 0;
-  if (sqrtn != 0)
-    *sqrtn = m;
-  return 1;
-}
 static int jacobi_iu(IV in, UV m) {
   int j = 1;
   UV n = (in < 0) ? -in : in;
@@ -286,7 +274,7 @@ int _XS_BPSW(UV const n)
         return 0;
       if (jacobi_iu(D, n) == -1)
         break;
-      if (P == (3+20) && is_perfect_square(n, 0)) return 0;
+      if (P == (3+20) && is_perfect_square(n)) return 0;
       P++;
       if (P > 65535)
         croak("lucas_extrastrong_params: P exceeded 65535");
@@ -423,7 +411,7 @@ int _XS_is_lucas_pseudoprime(UV n, int strength)
       if (gcd_ui(Du, n) > 1 && gcd_ui(Du, n) != n) return 0;
       if (jacobi_iu(D, n) == -1)
         break;
-      if (Du == 21 && is_perfect_square(n, 0)) return 0;
+      if (Du == 21 && is_perfect_square(n)) return 0;
       Du += 2;
       sign = -sign;
     }
@@ -437,7 +425,7 @@ int _XS_is_lucas_pseudoprime(UV n, int strength)
       if (gcd_ui(D, n) > 1 && gcd_ui(D, n) != n) return 0;
       if (jacobi_iu(D, n) == -1)
         break;
-      if (P == 21 && is_perfect_square(n, 0)) return 0;
+      if (P == 21 && is_perfect_square(n)) return 0;
       P++;
     }
   }
@@ -595,7 +583,7 @@ int _XS_is_almost_extra_strong_lucas_pseudoprime(UV n, UV increment)
       return 0;
     if (jacobi_iu(D, n) == -1)
       break;
-    if (P == (3+20*increment) && is_perfect_square(n, 0)) return 0;
+    if (P == (3+20*increment) && is_perfect_square(n)) return 0;
     P += increment;
     if (P > 65535)
       croak("lucas_extrastrong_params: P exceeded 65535");
@@ -680,7 +668,7 @@ int _XS_is_frobenius_underwood_pseudoprime(UV n)
 
   if (n < 7) return (n == 2 || n == 3 || n == 5);
   if ((n % 2) == 0 || n == UV_MAX) return 0;
-  if (is_perfect_square(n,0)) return 0;
+  if (is_perfect_square(n)) return 0;
 
   x = 0;
   t = -1;
diff --git a/t/50-factoring.t b/t/50-factoring.t
index a3af134..49cfbce 100644
--- a/t/50-factoring.t
+++ b/t/50-factoring.t
@@ -113,7 +113,7 @@ plan tests => (3 * scalar @testn)
             + 2*scalar(keys %prime_factors)
             + 4*scalar(keys %all_factors)
             + 2*scalar(keys %factor_exponents)
-            + 10*9
+            + 10*8  # 10 extra factoring tests * 8 algorithms
             + 8
             + 1;
 
@@ -159,7 +159,6 @@ extra_factor_test("trial_factor",  sub {Math::Prime::Util::trial_factor(shift)})
 extra_factor_test("fermat_factor", sub {Math::Prime::Util::fermat_factor(shift)});
 extra_factor_test("holf_factor",   sub {Math::Prime::Util::holf_factor(shift)});
 extra_factor_test("squfof_factor", sub {Math::Prime::Util::squfof_factor(shift)});
-extra_factor_test("rsqufof_factor", sub {Math::Prime::Util::rsqufof_factor(shift)});
 extra_factor_test("pbrent_factor", sub {Math::Prime::Util::pbrent_factor(shift)});
 extra_factor_test("prho_factor",   sub {Math::Prime::Util::prho_factor(shift)});
 extra_factor_test("pminus1_factor",sub {Math::Prime::Util::pminus1_factor(shift)});
diff --git a/util.h b/util.h
index 0c40e59..ff4a99d 100644
--- a/util.h
+++ b/util.h
@@ -46,7 +46,7 @@ extern UV znorder(UV a, UV n);
   #define MPU_PROB_PRIME_BEST  UVCONST(100000000)
 #endif
 
-#ifdef FUNC_isqrt
+#if defined(FUNC_isqrt) || defined(FUNC_is_perfect_square)
 static UV isqrt(UV n) {
   UV root;
 #if BITS_PER_WORD == 32
@@ -101,4 +101,22 @@ static UV lcm_ui(UV x, UV y) {
 }
 #endif
 
+#ifdef FUNC_is_perfect_square
+/* See:  http://mersenneforum.org/showpost.php?p=110896 */
+static int is_perfect_square(UV n)
+{
+  UV m;
+  m = n & 127;
+  if ((m*0x8bc40d7d) & (m*0xa1e2f5d1) & 0x14020a)  return 0;
+  /* If your sqrt is particularly slow, this cuts out another 80%:
+     m = n % 63; if ((m*0x3d491df7) & (m*0xc824a9f9) & 0x10f14008) return 0;
+     and this cuts out some more:
+     m = n % 25; if ((m*0x1929fc1b) & (m*0x4c9ea3b2) & 0x51001005) return 0;
+   */
+  m = (UV) ( sqrt((double) n) + 0.5 );
+  return m*m == n;
+}
+#endif
+
+
 #endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git