[libmath-prime-util-perl] 13/181: Documentation updates; popcnt updates for LMO

Thu May 21 18:51:01 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.36
in repository libmath-prime-util-perl.

commit db3e43acdfd1d1ee06fe9284c9f18d896d171e36
Author: Dana Jacobsen <dana at acm.org>
Date:   Sun Dec 15 02:00:04 2013 -0800

    Documentation updates; popcnt updates for LMO
---
 lehmer.c | 49 ++++++++++++------------------------
 lmo.c    | 88 +++++++++++++++++++++++++++++++++++++---------------------------
 2 files changed, 67 insertions(+), 70 deletions(-)

diff --git a/lehmer.c b/lehmer.c
index 1425a24..89edbc7 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -5,7 +5,7 @@
 
 /* Below this size, just sieve (with table speedup). */
 #define SIEVE_LIMIT  60000000
-#define MAX_PHI_MEM  (256*1024*1024)
+#define MAX_PHI_MEM  (896*1024*1024)
 
 /*****************************************************************************
  *
@@ -31,40 +31,23 @@
  * As with all Lehmer-Meissel-Legendre algorithms, memory use will be a
  * constraint with large values of x (see the table below).
  *
- * If you want something better, I highly recommend the paper "Computing
- * Pi(x): the combinatorial method" (2006) by Tomás Oliveira e Silva.  His
- * implementation is certainly much faster and lower memory than this.  I have
- * briefly run Christian Bau's LMO implementation which has the big advantage
- * of using almost no memory.  On the same machine that ran the times below,
- * I got 10^16 in 36s, 10^17 in 165s, 10^18 in 769s, 10^19 in 4162s.  All
- * using a single core.  That's 10-50x faster than this code.
+ * Math::Prime::Util now includes an extended LMO implementation, which will
+ * be quite a bit faster and much less memory than this code.  It is the
+ * default method for large counts.  Timing comparisons are in that file.
  *
- * Using my sieve code with everything running in serial, calculating pi(10^12)
- * is done under 1 second on my computer.  pi(10^14) takes under 30 seconds,
- * pi(10^16) in under 20 minutes.  Compared with Thomas R. Nicely's pix4
- * program, this one is 5x faster and uses 10x less memory.  When compiled
- * with parallel primesieve it is over 10x faster.
- *   pix4(10^16) takes 124 minutes, this code + primesieve takes < 4 minutes.
+ * Times and memory use for prime_count(10^15) on a Haswell 4770K:
  *
- * Timings with Perl + MPU with all-serial computation, no memory limit.
- * The last column is the standalone time with 12-core parallel primesieve.
- *
- *    n     phi(x,a) mem/time  |  stage 4 mem/time  | total time | pps time
- *   10^19  17884MB   1979.41  | 9144MB             |            | 7h 26m
- *   10^18   5515MB    483.46  | 2394MB             |            | 87m  0s
- *   10^17   1698MB    109.56  |  634MB   9684.1    | 163m 36  s | 17m 45s
- *   10^16    522MB     24.14  |  171MB   1066.3    |  18m 12  s |  3m 44s
- *   10^15    159MB      5.58  |   47MB    142.5    |   2m 28  s |   47.66 s
- *   10^14     48MB      1.28  |   13MB     22.26   |    23.61 s |   10.25 s
- *   10^13     14MB      0.294 |    4MB      3.82   |     4.12 s |    2.21 s
- *   10^12      4MB      0.070 |    1MB      0.707  |     0.78 s |    0.459s
- *   10^11      1MB      0.015 |             0.135  |     0.158s |    0.097s
- *   10^10               0.003 |             0.029  |     0.028s |    0.025s
- *
- * Using the standalone program with parallel primesieve speeds up stage 4
- * a lot for large values, as can be seen by the last column.  It does use
- * quite a bit more memory in stage 4 however (lowering SIEVE_MULT can reduce
- * it by as much as 10x, at the expense of some performance).
+ *       4.80s    1.3MB    LMO
+ *      27.51s* 136.0MB    Lehmer    Walisch primecount v0.7, 8 threads
+ *      42.99s* 159.4MB    Lehmer    standalone, 8 threads
+ *      50.82s* 136.2MB    Meissel   Walisch primecount v0.7, 8 threads
+ *      64.06s* 144.3MB    Legendre  Walisch primecount v0.7, 8 threads
+ *     114.40s   28.4MB    Lehmer    Walisch primecount v0.7, 1 thread
+ *     118.69s   27.1MB    LMOS
+ *     120.39s  158.4MB    Lehmer
+ *     112.40s  286.6MB    Meissel
+ *     303.77s   28.0MB    Legendre
+ *     868.96s 1668.1MB    Lehmer    pix4 by T.R. Nicely
  *
  * Reference: Hans Riesel, "Prime Numbers and Computer Methods for
  * Factorization", 2nd edition, 1994.
diff --git a/lmo.c b/lmo.c
index fc85ca5..0e2111b 100644
--- a/lmo.c
+++ b/lmo.c
@@ -27,29 +27,30 @@
  *
  * Sieve:   Segmented, single threaded, thread-safe.  Small table enhanced,
  *          fastest for n < 60M.  Bad growth rate (like all sieves will have).
- * Legendre:Combinatorial phi.  Simple implementation.
- * Meissel: Combinatorial phi.  Simple implementation.
- * Lehmer:  Combinatorial phi.  Memory use grows rapidly.
- * LMOS:    Combinatorial phi.  Basic LMO implementation.
- * LMO:     Sieve phi.  10-50x faster than LMOS, better growth rate,
- *          Much, much better memory use than the others.
+ * Legendre:Simple.  Recursive caching phi.
+ * Meissel: Simple.  Non-recursive phi, lots of memory.
+ * Lehmer:  Non-recursive phi, tries to restrict memory.
+ * LMOS:    Simple.  Non-recursive phi, less memory than Lehmer above.
+ * LMO:     Sieve phi.  Much faster and less memory than the others.
  *
- * Performance is slightly worse than Christian Bau's implementation, but
- * in theory this implementation has more parallelization opportunities.
- * Timing below is single core using MPU.
+ * Timing below is single core Haswell 4770K using Math::Prime::Util.
  *
- *  |   n   |  Meissel |  Lehmer  |   LMOS   |    LMO    |
- *  +-------+----------+----------+----------+-----------+
- *  | 10^19 |          |          |          | 3765.02   |
- *  | 10^18 |          |          |          |  778.21   |
- *  | 10^17 |          |          | 8844.2   |  163.77   |
- *  | 10^16 | 1410.2   | 1043.6   | 1058.9   |   34.81   |
- *  | 10^15 |  137.1   |  137.3   |  142.7   |    7.905  |
- *  | 10^14 |   26.18  |   21.74  |   21.29  |    1.726  |
- *  | 10^13 |    5.155 |    3.947 |    3.353 |    0.405  |
- *  | 10^12 |    1.059 |    0.700 |    0.626 |    0.0936 |
- *  | 10^11 |    0.227 |    0.138 |    0.124 |    0.0227 |
- *  | 10^10 |    0.0509|    0.0309|    0.0286|    0.00589|
+ *  |   n   | Legendre |  Meissel |  Lehmer  |   LMOS   |    LMO    |
+ *  +-------+----------+----------+----------+----------+-----------+
+ *  | 10^19 |          |          |          |          | 2493.4    |
+ *  | 10^18 |          |          |          |          |  498.16   |
+ *  | 10^17 |          | 2950.9   | 7499.0   | 7125.7   |  103.03   |
+ *  | 10^16 | 2422.7   |  575.8   |  872.5   |  884.9   |   21.94   |
+ *  | 10^15 |  303.3   |  112.1   |  119.3   |  119.1   |    4.786  |
+ *  | 10^14 |   40.10  |   22.06  |   19.06  |   17.58  |    1.052  |
+ *  | 10^13 |    5.678 |    4.330 |    3.316 |    2.810 |    0.237  |
+ *  | 10^12 |    0.901 |    0.889 |    0.617 |    0.524 |    54.9ms |
+ *  | 10^11 |    0.182 |    0.192 |    0.122 |    0.114 |    13.80ms|
+ *  | 10^10 |    40.2ms|    41.7ms|    26.6ms|    25.6ms|     3.64ms|
+ *
+ * Run with high memory limits: Meissel uses 1GB for 10^16, ~3GB for 10^17.
+ * Lehmer is limited at high n values by sieving speed.  It is much faster
+ * using parallel primesieve, though cannot come close to LMO.
  */
 
 /* Below this size, just sieve (with table speedup). */
@@ -77,26 +78,27 @@
   typedef uint32_t       uint32;
 #endif
 
+/* UV is either uint32 or uint64 depending on Perl.  We use this native size
+ * for the basic unit of the phi sieve.  It can be easily overridden here. */
 typedef  UV  sword_t;
 #define SWORD_BITS  BITS_PER_WORD
 #define SWORD_ONES  UV_MAX
 #define SWORD_MASKBIT(bits)  (UVCONST(1) << ((bits) % SWORD_BITS))
 #define SWORD_CLEAR(s,bits)  s[bits/SWORD_BITS] &= ~SWORD_MASKBIT(bits)
 
-#if defined(__GNUC__) && defined(__SSE4_2__)
-static sword_t bitcount(sword_t b) {
-  sword_t ret;
-  __asm__("popcnt %1, %0" : "=r" (ret) : "r" (b));
-  return ret;
-}
-#elif SWORD_BITS == 64
-static sword_t bitcount(sword_t b) {
-  b -= (b >> 1) & 0x5555555555555555;
-  b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
-  b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
-  return (b * 0x0101010101010101)>>56;
-}
-#else   /* Faster than __builtin_popcount */
+/* Compile with -march=native to get a large speedup on Nahalem and newer */
+#if SWORD_BITS == 64
+ #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR > 1))
+   #define bitcount(b)  __builtin_popcountll(b)
+ #else
+   static sword_t bitcount(sword_t b) {
+     b -= (b >> 1) & 0x5555555555555555;
+     b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
+     b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
+     return (b * 0x0101010101010101)>>56;
+   }
+ #endif
+#else
 static const unsigned char byte_ones[256] =
   {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
@@ -124,15 +126,27 @@ static uint32_t* make_primelist(uint32 n, uint32* number_of_primes)
                    : (n/log(n)) * (1.0+1.0/logn+2.51/(logn*logn));
   *number_of_primes = 0;
   New(0, plist, max_index+1, uint32_t);
-  if (plist == 0)
-    croak("Can not allocate small primes\n");
+  if (plist == 0)  croak("Can not allocate small primes\n");
   plist[0] = 0;
+  /* We could do a simple SoE here.  This is not time critical. */
   START_DO_FOR_EACH_PRIME(2, n) {
     plist[++i] = p;
   } END_DO_FOR_EACH_PRIME;
   *number_of_primes = i;
   return plist;
 }
+#if 0  /* primesieve 5.0 example */
+#include <primesieve.h>
+static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) {
+  uint32_t plist;
+  uint32_t* psprimes = generate_primes(2, n, number_of_primes, UINT_PRIMES);
+  New(0, plist, *number_of_primes + 1, uint32_t);
+  plist[0] = 0;
+  memcpy(plist+1, psprimes, *number_of_primes * sizeof(uint32_t));
+  primesieve_free(psprimes);
+  return plist;
+}
+#endif
 
 /* Given a max prime in small prime list, return max prev prime input */
 static uint32 prev_sieve_max(UV maxprime) {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git