[libmath-prime-util-perl] 14/17: Performance enhancements

Thu May 21 18:47:14 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.21
in repository libmath-prime-util-perl.

commit 9285897e07f4c96fc6cae969e5f7a7acc56f4e28
Author: Dana Jacobsen <dana at acm.org>
Date:   Fri Feb 22 16:50:41 2013 -0800

    Performance enhancements
---
 Changes                | 16 +++++++---
 MANIFEST               |  1 +
 XS.xs                  |  6 ++--
 factor.c               | 81 ++++++++++++++++++++++++++++++++++++------------
 lib/Math/Prime/Util.pm | 83 +++++++++++++++++++++++---------------------------
 t/19-moebius.t         | 61 ++++++++++++++++++++++++++++++++-----
 util.c                 |  2 +-
 xt/moebius-mertens.pl  | 35 +++++++++++++++++++++
 8 files changed, 205 insertions(+), 80 deletions(-)

diff --git a/Changes b/Changes
index 714a7af..a2e6c17 100644
--- a/Changes
+++ b/Changes
@@ -4,17 +4,25 @@ Revision history for Perl extension Math::Prime::Util.
 
     - Switch to using Bytes::Random::Secure for random primes.
 
-    - primes.pl: Add circular and Panaitopol primes, speedup Pillai primes.
-
     - Spelling fixes in documentation.
 
-    - Fast return of Euler's totient and Möbius function over a range.
+    - primes.pl: Add circular and Panaitopol primes.
+
+    - euler_phi and moebius now will compute over a range.
 
     - Add mertens function: 1000+ times faster than summing moebius($_).
 
     - Add exp_mangoldt function: exponential of von Mangoldt's function.
 
-    - divisor sum is 2x faster.  Also default to sigma if no sub given.
+    - divisor_sum defaults to sigma if no sub is given (i.e. it sums).
+
+    - Performance:
+       - Speedup factoring small numbers.  With -nobigint factoring from
+         1 to 10M, it's 1.2x faster.  1.5x faster than Math::Factor::XS.
+       - Totient and Möbius over a range are much faster than separate calls.
+       - divisor_sum is 2x faster.
+       - primes.pl is much faster with Pillai primes.
+       - Reduce overhead in euler_phi -- about 2x faster for individual calls.
 
 0.20  3 February 2012
 
diff --git a/MANIFEST b/MANIFEST
index 6945e23..6457a3a 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -87,4 +87,5 @@ t/81-bignum.t
 t/90-release-perlcritic.t
 t/91-release-pod-syntax.t
 t/92-release-pod-coverage.t
+xt/moebius-mertens.pl
 .travis.yml
diff --git a/XS.xs b/XS.xs
index e831b0a..d79b4f7 100644
--- a/XS.xs
+++ b/XS.xs
@@ -219,7 +219,7 @@ _XS_factor(IN UV n)
   PPCODE:
     if (n < 4) {                        /* If n is 0-3, we're done. */
       XPUSHs(sv_2mortal(newSVuv( n )));
-    } else if (n < 2000000) {           /* For small n, just trial division */
+    } else if (n < 10000000) {          /* For small n, just trial division */
       int i;
       UV facs[32];  /* maximum number of factors is log2n */
       UV nfacs = trial_factor(n, facs, 0);
@@ -228,8 +228,8 @@ _XS_factor(IN UV n)
       }
     } else {
       int const verbose = _XS_get_verbose();
-      UV const tlim_lower = 211;  /* Trial division through this prime */
-      UV const tlim = 223;        /* This means we've checked through here */
+      UV const tlim_lower = 401;  /* Trial division through this prime */
+      UV const tlim = 409;        /* This means we've checked through here */
       UV tofac_stack[MPU_MAX_FACTORS+1];
       UV factored_stack[MPU_MAX_FACTORS+1];
       int ntofac = 0;
diff --git a/factor.c b/factor.c
index 5396f78..96c7e64 100644
--- a/factor.c
+++ b/factor.c
@@ -20,50 +20,91 @@
  * match the native integer type used inside our Perl, so just use those.
  */
 
+static const unsigned short primes_small[] =
+  {0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
+   101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
+   193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
+   293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
+   409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,
+   521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,
+   641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,739,743,751,
+   757,761,769,773,787,797,809,811,821,823,827,829,839,853,857,859,863,877,
+   881,883,887,907,911,919,929,937,941,947,953,967,971,977,983,991,997,1009,
+   1013,1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,1087,1091,1093,
+   1097,1103,1109,1117,1123,1129,1151,1153,1163,1171,1181,1187,1193,1201,
+   1213,1217,1223,1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,1297,
+   1301,1303,1307,1319,1321,1327,1361,1367,1373,1381,1399,1409,1423,1427,
+   1429,1433,1439,1447,1451,1453,1459,1471,1481,1483,1487,1489,1493,1499,
+   1511,1523,1531,1543,1549,1553,1559,1567,1571,1579,1583,1597,1601,1607,
+   1609,1613,1619,1621,1627,1637,1657,1663,1667,1669,1693,1697,1699,1709,
+   1721,1723,1733,1741,1747,1753,1759,1777,1783,1787,1789,1801,1811,1823,
+   1831,1847,1861,1867,1871,1873,1877,1879,1889,1901,1907,1913,1931,1933,
+   1949,1951,1973,1979,1987,1993,1997,1999,2003,2011};
+#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0]))
 
 int trial_factor(UV n, UV *factors, UV maxtrial)
 {
-  UV f, m, limit;
+  UV f, limit, newlimit;
   int nfactors = 0;
 
   if (maxtrial == 0)  maxtrial = UV_MAX;
 
-  if ( (n < 2) || (maxtrial < 2) ) {
+  /* Cover the cases 0/1/2/3 now */
+  if ( (n < 4) || (maxtrial < 4) ) {
     factors[0] = n;
     return 1;
   }
+  /* Trial division for 2, 3, 5, 7, and see if we're done */
   while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n /= 2; }
-
-  if (3  <= maxtrial) while ( (n %  3) == 0 ) { factors[nfactors++] =  3; n /=  3; }
-  if (5  <= maxtrial) while ( (n %  5) == 0 ) { factors[nfactors++] =  5; n /=  5; }
-  if (7  <= maxtrial) while ( (n %  7) == 0 ) { factors[nfactors++] =  7; n /=  7; }
+  if (3<=maxtrial) while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
+  if (5<=maxtrial) while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
+  if (7<=maxtrial) while ( (n % 7) == 0 ) { factors[nfactors++] = 7; n /= 7; }
   f = 11;
-  m = 11;
-
   if ( (n < (f*f)) || (maxtrial < f) ) {
     if (n != 1)
       factors[nfactors++] = n;
     return nfactors;
   }
 
+  /* Trial division to this number at most.  Reduced as we find factors. */
   limit = (UV) (sqrt(n)+0.1);
   if (limit > maxtrial)
     limit = maxtrial;
 
-  /* wheel 30 */
-  while (f <= limit) {
-    if ( (n%f) == 0 ) {
-      UV newlimit;
-      do {
-        factors[nfactors++] = f;
-        n /= f;
-      } while ( (n%f) == 0 );
-      newlimit = (UV) (sqrt(n)+0.1);
-      if (newlimit < limit)  limit = newlimit;
+  /* Use the table of small primes to quickly do trial division. */
+  {
+    UV sp = 5;
+    f = primes_small[sp];
+    while (f <= limit && f <= 2003) {
+      if ( (n%f) == 0 ) {
+        do {
+          factors[nfactors++] = f;
+          n /= f;
+        } while ( (n%f) == 0 );
+        newlimit = (UV) (sqrt(n)+0.1);
+        if (newlimit < limit)  limit = newlimit;
+      }
+      f = primes_small[++sp];
+    }
+  }
+
+  /* Trial division using a mod-30 wheel for larger values */
+  if (f <= limit) {
+    UV m = f % 30;
+    while (f <= limit) {
+      if ( (n%f) == 0 ) {
+        do {
+          factors[nfactors++] = f;
+          n /= f;
+        } while ( (n%f) == 0 );
+        newlimit = (UV) (sqrt(n)+0.1);
+        if (newlimit < limit)  limit = newlimit;
+      }
+      f += wheeladvance30[m];
+      m = nextwheel30[m];
     }
-    f += wheeladvance30[m];
-    m = nextwheel30[m];
   }
+  /* All done! */
   if (n != 1)
     factors[nfactors++] = n;
   return nfactors;
diff --git a/lib/Math/Prime/Util.pm b/lib/Math/Prime/Util.pm
index 3d111e2..82ca25c 100644
--- a/lib/Math/Prime/Util.pm
+++ b/lib/Math/Prime/Util.pm
@@ -51,10 +51,11 @@ sub _import_nobigint {
   undef *is_prob_prime; *is_prob_prime   = \&_XS_is_prob_prime;
   undef *next_prime;    *next_prime      = \&_XS_next_prime;
   undef *prev_prime;    *prev_prime      = \&_XS_prev_prime;
-  #undef *prime_count;   *prime_count     = \&_XS_prime_count;
+ #undef *prime_count;   *prime_count     = \&_XS_prime_count;
   undef *nth_prime;     *nth_prime       = \&_XS_nth_prime;
   undef *is_strong_pseudoprime;  *is_strong_pseudoprime = \&_XS_miller_rabin;
   undef *miller_rabin;  *miller_rabin    = \&_XS_miller_rabin;
+  undef *mertens;       *mertens         = \&_XS_mertens;
 }
 
 BEGIN {
@@ -1062,10 +1063,6 @@ sub all_factors {
 
 # A008683 Moebius function mu(n)
 # A030059, A013929, A030229, A002321, A005117, A013929 all relate.
-
-# One can argue for the Omega function (A001221), Euler Phi (A000010), and
-# Merten's functions also.
-
 sub moebius {
   my($n, $nend) = @_;
   _validate_positive_integer($n, 1);
@@ -1074,10 +1071,7 @@ sub moebius {
   if (defined $nend) {
     _validate_positive_integer($nend);
     return () if $nend < $n;
-    if ($nend <= $_XS_MAXVAL) {
-      my $mu = _XS_moebius_range($n, $nend);
-      return @$mu;
-    }
+    return @{ _XS_moebius_range($n, $nend) } if $nend <= $_XS_MAXVAL;
     my @mu = map { 1 } 0 .. $nend;
     foreach my $j (2 .. $nend) {
       next unless $mu[$j] == 1;
@@ -1106,15 +1100,16 @@ sub moebius {
   return (((scalar @factors) % 2) == 0) ? 1 : -1;
 }
 
+# A002321 Mertens' function.  mertens(n) = sum(moebius(1,n))
 sub mertens {
   my($n) = @_;
   _validate_positive_integer($n);
-  return (0,1)[$n] if $n <= 1;
   return _XS_mertens($n) if $n <= $_XS_MAXVAL;
   # This is the most basic Deléglise and Rivat algorithm.  u = n^1/2
   # and no segmenting is done.  Their algorithm uses u = n^1/3, breaks
   # the summation into two parts, and calculates those in segments.  Their
   # computation time growth is half of this code.
+  return $n if $n <= 1;
   my $u = int(sqrt($n));
   my @mu = (0, moebius(1, $u));          # Hold values of mu for 0-u
   my $musum = 0;
@@ -1137,25 +1132,20 @@ sub mertens {
 }
 
 
-# Euler Phi, aka Euler Totient.  A000010
-
+# A000010 Euler Phi, aka Euler Totient
 sub euler_phi {
   my($n, $nend) = @_;
   # SAGE defines this to be 0 for all n <= 0.  Others choose differently.
   # I am following SAGE's decision for n <= 0.
   return 0 if defined $n && $n < 0;
   _validate_positive_integer($n);
-  undef $nend if defined $nend && $nend == $n;
 
   # Totient over a range.  Could be improved, as this can use huge memory.
-  if (defined $nend) {
+  if (defined $nend && $nend != $n) {
     _validate_positive_integer($nend);
     return () if $nend < $n;
     # Use XS code if at all possible.  Better memory use.
-    if ($nend <= $_XS_MAXVAL) {
-      my $totients = _XS_totient_range($n, $nend);
-      return @$totients;
-    }
+    return @{ _XS_totient_range($n, $nend) } if $nend <= $_XS_MAXVAL;
     my @totients = map { $_ } 0 .. $nend;
     foreach my $i (2 .. $nend) {
       next unless $totients[$i] == $i;
@@ -1169,38 +1159,40 @@ sub euler_phi {
   }
 
   return (0,1)[$n] if $n <= 1;
-  my %factor_mult;
-  my @factors = grep { !$factor_mult{$_}++ }
-                ($n <= $_XS_MAXVAL) ? _XS_factor($n) : factor($n);
 
-  # Direct from Euler's product formula.  Note division will be exact.
-  #my $totient = $n;
-  #foreach my $factor (@factors) {
-  #  $totient = int($totient/$factor) * ($factor-1);
-  #}
+  if ($n <= $_XS_MAXVAL) {
+    my $last_f = 0;
+    my $totient = 1;
+    foreach my $f ( _XS_factor($n) ) {
+      if ($f == $last_f) {  $totient *= $f;                   }
+      else               {  $totient *= $f-1;  $last_f = $f;  }
+    }
+    return $totient;
+  }
+
+  my %factor_mult;
+  my @factors = grep { !$factor_mult{$_}++ } factor($n);
+  my $totient = 1;
 
-  # Alternate way doing multiplications only.
   if (ref($n) ne 'Math::BigInt') {
-    my $totient = 1;  # $n - $n + 1 will make this a bigint if needed
     foreach my $factor (@factors) {
       $totient *= ($factor - 1);
       $totient *= $factor for (2 .. $factor_mult{$factor});
     }
-    return $totient;
-  }
-
-  # Some real wackiness to solve issues with Math::BigInt::GMP (not seen with
-  # Pari or Calc).  Results of the multiply will go negative if we don't do
-  # this.  Standalone bug:
-  #      perl -E 'my $a = 2931542417; use bigint lib=>'GMP'; my $n = 49754396241690624; my $x = $n*$a; say $x;'
-  # This may be related to RT 71548 of Math::BigInt::GMP.
-  my $totient = $n->copy->bone;
-  foreach my $factor (@factors) {
-    my $f = $n->copy->bzero->badd("$factor");
-    $totient->bmul($f->copy->bsub(1));
-    $totient->bmul($f)  for (2 .. $factor_mult{$factor});
+  } else {
+    # Some real wackiness to solve issues with Math::BigInt::GMP (not seen with
+    # Pari or Calc).  Results of the multiply will go negative if we don't do
+    # this.  To see if you hit the standalone bug:
+    #      perl -E 'my $a = 2931542417; use bigint lib=>'GMP'; my $n = 49754396241690624; my $x = $n*$a; say $x;'
+    # This may be related to RT 71548 of Math::BigInt::GMP.
+    $totient = $n->copy->bone;
+    foreach my $factor (@factors) {
+      my $f = $n->copy->bzero->badd("$factor");
+      $totient->bmul($f->copy->bsub(1));
+      $totient->bmul($f)  for (2 .. $factor_mult{$factor});
+    }
   }
-  $totient;
+  return $totient;
 }
 
 # Jordan's totient -- a generalization of Euler's totient.
@@ -2440,9 +2432,10 @@ is much faster, though a segmented sieve must be used for large C<n> to
 control the memory taken.  Benito and Varona (2008) show a simple C<n/3>
 summation which is much faster and uses less memory.  Better yet is a
 simple C<n^1/2> version of Deléglise and Rivat (1996), which is what the
-current implementation uses.  The best known implementation is Deléglise and
-Rivat's segmented C<n^1/3> algorithm.  In theory using one of the advanced
-prime count algorithms can lead to a faster solution.
+current implementation uses.  Deléglise and Rivat's full segmented C<n^1/3>
+algorithm is faster.  Kuznetsov (2011) gives an alternate method that he
+indicates is even faster.  In theory using one of the advanced prime count
+algorithms can lead to a faster solution.
 
 
 =head2 euler_phi
diff --git a/t/19-moebius.t b/t/19-moebius.t
index 3da2504..1542bc3 100644
--- a/t/19-moebius.t
+++ b/t/19-moebius.t
@@ -5,6 +5,7 @@ use warnings;
 use Test::More;
 use Math::Prime::Util qw/moebius mertens euler_phi jordan_totient divisor_sum/;
 
+my $extra = defined $ENV{RELEASE_TESTING} && $ENV{RELEASE_TESTING};
 my $use64 = Math::Prime::Util::prime_get_config->{'maxbits'} > 32;
 my $broken64 = (18446744073709550592 == ~0);
 $use64 = 0 if $broken64;
@@ -20,10 +21,55 @@ my %mertens = (
       100 =>    1,
      1000 =>    2,
     10000 =>  -23,
-#   100000 =>  -48,
-#  1000000 =>  212,
-# 10000000 => 1037,
+        8 =>   -2,
+       16 =>   -1,
+       32 =>   -4,
+       64 =>   -1,
+      128 =>   -2,
+      256 =>   -1,
+      512 =>   -4,
+     1024 =>   -4,
+     2048 =>    7,
+     4096 =>  -19,
+     8192 =>   22,
 );
+my %big_mertens = (
+   100000 =>  -48,
+  1000000 =>  212,
+ 10000000 => 1037,
+);
+if ($extra && $use64) {
+  %big_mertens = ( %big_mertens,
+          2 =>  0,      # A087987, mertens at primorials
+          6 => -1,
+         30 => -3,
+        210 => -1,
+       2310 => -1,
+      30030 => 16,
+     510510 => -25,
+    9699690 => 278,
+  223092870 => 3516,
+
+    6433477 => 900,     # 30^2
+  109851909 => -4096,   # A084235, 2^12  
+
+      2**14 =>  -32,    # A084236
+      2**15 =>   26,
+      2**16 =>   14,
+      2**17 =>  -20,
+      2**18 =>   24,
+      2**19 => -125,
+      2**20 =>  257,
+      2**21 => -362,
+      2**22 =>  228,
+      2**23 =>  -10,
+
+     10**8  => 1928,
+     10**9  => -222,
+#  1*10**10 => -33722,  # From Deleglise and Rivat
+#  2*10**10 => -48723,  # Too slow with current method
+  );
+}
 
 my %totients = (
      123456 => 41088, 
@@ -66,7 +112,8 @@ my %sigmak = (
 
 plan tests => 0 + 1
                 + 1 # Small Moebius
-                + 3*scalar(keys %mertens) + 3
+                + 3*scalar(keys %mertens)
+                + 1*scalar(keys %big_mertens)
                 + 2 # Small Phi
                 + scalar(keys %totients)
                 + scalar(keys %jordan_totients)
@@ -93,9 +140,9 @@ while (my($n, $mertens) = each (%mertens)) {
   # Now with mertens function
   is( mertens($n), $mertens, "mertens($n) == $mertens" );
 }
-is( mertens(  100000),  -48, "mertens(100000)" );
-is( mertens( 1000000),  212, "mertens(1000000)" );
-is( mertens(10000000), 1037, "mertens(10000000)" );
+while (my($n, $mertens) = each (%big_mertens)) {
+  is( mertens($n), $mertens, "mertens($n)" );
+}
 
 {
   my @phi = map { euler_phi($_) } (0 .. $#A000010);
diff --git a/util.c b/util.c
index 402106b..4bf6e4a 100644
--- a/util.c
+++ b/util.c
@@ -642,7 +642,6 @@ IV* _moebius_range(UV lo, UV hi)
    */
   range = hi-lo+1;
   sqrtn = (UV) (sqrt(hi) + 0.5);
-  /* sqrtn = (UV) sqrt(hi);  if (sqrtn*sqrtn < hi) sqrtn++; */  /* TODO */
 
   New(0, mu, range, IV);
   if (mu == 0)
@@ -705,6 +704,7 @@ IV _XS_mertens(UV n) {
   IV* M;
   IV sum;
 
+  if (n <= 1)  return n;
   u = (UV) sqrt(n);
   mu = _moebius_range(0, u);
   New(0, M, u+1, IV);
diff --git a/xt/moebius-mertens.pl b/xt/moebius-mertens.pl
new file mode 100755
index 0000000..b25226f
--- /dev/null
+++ b/xt/moebius-mertens.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+$| = 1;  # fast pipes
+
+use Math::Prime::Util qw/moebius mertens/;
+use List::Util qw/sum/;
+use Algorithm::Diff qw/diff/;
+
+my $limit = shift || 1_000_000;
+
+print "Calculating moebius from 1 to $limit...";
+my @mu = map { moebius($_) } 1 .. $limit;
+print "...";
+unshift @mu, 0;
+print "...done\n";
+
+while (1) {
+  my $beg = 1 + int(rand($limit));
+  my $end = 1 + int(rand($limit));
+  ($beg,$end) = ($end,$beg) if $beg > $end;
+
+  # Does moebius range return the same values?
+  my @mu_range = @mu[ $beg .. $end ];
+  my @mobius = moebius($beg,$end);
+
+  my $mu_sum = sum(@mu_range);
+  my $mo_sum = sum(@mobius);
+  my $mert_sum = mertens($end) - mertens($beg-1);
+  warn "\nbeg $beg  end $end  sum $mu_sum  range sum $mo_sum\n"
+       unless $mu_sum == $mo_sum;
+  warn "\nbeg $beg  end $end  sum $mu_sum  mertsum $mert_sum\n"
+       unless $mu_sum == $mert_sum;
+  print ".";
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git