[libmath-prime-util-perl] 13/181: Documentation updates; popcnt updates for LMO
Partha P. Mukherjee
ppm-guest at moszumanska.debian.org
Thu May 21 18:51:01 UTC 2015
This is an automated email from the git hooks/post-receive script.
ppm-guest pushed a commit to annotated tag v0.36
in repository libmath-prime-util-perl.
commit db3e43acdfd1d1ee06fe9284c9f18d896d171e36
Author: Dana Jacobsen <dana at acm.org>
Date: Sun Dec 15 02:00:04 2013 -0800
Documentation updates; popcnt updates for LMO
---
lehmer.c | 49 ++++++++++++------------------------
lmo.c | 88 +++++++++++++++++++++++++++++++++++++---------------------------
2 files changed, 67 insertions(+), 70 deletions(-)
diff --git a/lehmer.c b/lehmer.c
index 1425a24..89edbc7 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -5,7 +5,7 @@
/* Below this size, just sieve (with table speedup). */
#define SIEVE_LIMIT 60000000
-#define MAX_PHI_MEM (256*1024*1024)
+#define MAX_PHI_MEM (896*1024*1024)
/*****************************************************************************
*
@@ -31,40 +31,23 @@
* As with all Lehmer-Meissel-Legendre algorithms, memory use will be a
* constraint with large values of x (see the table below).
*
- * If you want something better, I highly recommend the paper "Computing
- * Pi(x): the combinatorial method" (2006) by Tomás Oliveira e Silva. His
- * implementation is certainly much faster and lower memory than this. I have
- * briefly run Christian Bau's LMO implementation which has the big advantage
- * of using almost no memory. On the same machine that ran the times below,
- * I got 10^16 in 36s, 10^17 in 165s, 10^18 in 769s, 10^19 in 4162s. All
- * using a single core. That's 10-50x faster than this code.
+ * Math::Prime::Util now includes an extended LMO implementation, which will
+ * be quite a bit faster and much less memory than this code. It is the
+ * default method for large counts. Timing comparisons are in that file.
*
- * Using my sieve code with everything running in serial, calculating pi(10^12)
- * is done under 1 second on my computer. pi(10^14) takes under 30 seconds,
- * pi(10^16) in under 20 minutes. Compared with Thomas R. Nicely's pix4
- * program, this one is 5x faster and uses 10x less memory. When compiled
- * with parallel primesieve it is over 10x faster.
- * pix4(10^16) takes 124 minutes, this code + primesieve takes < 4 minutes.
+ * Times and memory use for prime_count(10^15) on a Haswell 4770K:
*
- * Timings with Perl + MPU with all-serial computation, no memory limit.
- * The last column is the standalone time with 12-core parallel primesieve.
- *
- * n phi(x,a) mem/time | stage 4 mem/time | total time | pps time
- * 10^19 17884MB 1979.41 | 9144MB | | 7h 26m
- * 10^18 5515MB 483.46 | 2394MB | | 87m 0s
- * 10^17 1698MB 109.56 | 634MB 9684.1 | 163m 36 s | 17m 45s
- * 10^16 522MB 24.14 | 171MB 1066.3 | 18m 12 s | 3m 44s
- * 10^15 159MB 5.58 | 47MB 142.5 | 2m 28 s | 47.66 s
- * 10^14 48MB 1.28 | 13MB 22.26 | 23.61 s | 10.25 s
- * 10^13 14MB 0.294 | 4MB 3.82 | 4.12 s | 2.21 s
- * 10^12 4MB 0.070 | 1MB 0.707 | 0.78 s | 0.459s
- * 10^11 1MB 0.015 | 0.135 | 0.158s | 0.097s
- * 10^10 0.003 | 0.029 | 0.028s | 0.025s
- *
- * Using the standalone program with parallel primesieve speeds up stage 4
- * a lot for large values, as can be seen by the last column. It does use
- * quite a bit more memory in stage 4 however (lowering SIEVE_MULT can reduce
- * it by as much as 10x, at the expense of some performance).
+ * 4.80s 1.3MB LMO
+ * 27.51s* 136.0MB Lehmer Walisch primecount v0.7, 8 threads
+ * 42.99s* 159.4MB Lehmer standalone, 8 threads
+ * 50.82s* 136.2MB Meissel Walisch primecount v0.7, 8 threads
+ * 64.06s* 144.3MB Legendre Walisch primecount v0.7, 8 threads
+ * 114.40s 28.4MB Lehmer Walisch primecount v0.7, 1 thread
+ * 118.69s 27.1MB LMOS
+ * 120.39s 158.4MB Lehmer
+ * 112.40s 286.6MB Meissel
+ * 303.77s 28.0MB Legendre
+ * 868.96s 1668.1MB Lehmer pix4 by T.R. Nicely
*
* Reference: Hans Riesel, "Prime Numbers and Computer Methods for
* Factorization", 2nd edition, 1994.
diff --git a/lmo.c b/lmo.c
index fc85ca5..0e2111b 100644
--- a/lmo.c
+++ b/lmo.c
@@ -27,29 +27,30 @@
*
* Sieve: Segmented, single threaded, thread-safe. Small table enhanced,
* fastest for n < 60M. Bad growth rate (like all sieves will have).
- * Legendre:Combinatorial phi. Simple implementation.
- * Meissel: Combinatorial phi. Simple implementation.
- * Lehmer: Combinatorial phi. Memory use grows rapidly.
- * LMOS: Combinatorial phi. Basic LMO implementation.
- * LMO: Sieve phi. 10-50x faster than LMOS, better growth rate,
- * Much, much better memory use than the others.
+ * Legendre:Simple. Recursive caching phi.
+ * Meissel: Simple. Non-recursive phi, lots of memory.
+ * Lehmer: Non-recursive phi, tries to restrict memory.
+ * LMOS: Simple. Non-recursive phi, less memory than Lehmer above.
+ * LMO: Sieve phi. Much faster and less memory than the others.
*
- * Performance is slightly worse than Christian Bau's implementation, but
- * in theory this implementation has more parallelization opportunities.
- * Timing below is single core using MPU.
+ * Timing below is single core Haswell 4770K using Math::Prime::Util.
*
- * | n | Meissel | Lehmer | LMOS | LMO |
- * +-------+----------+----------+----------+-----------+
- * | 10^19 | | | | 3765.02 |
- * | 10^18 | | | | 778.21 |
- * | 10^17 | | | 8844.2 | 163.77 |
- * | 10^16 | 1410.2 | 1043.6 | 1058.9 | 34.81 |
- * | 10^15 | 137.1 | 137.3 | 142.7 | 7.905 |
- * | 10^14 | 26.18 | 21.74 | 21.29 | 1.726 |
- * | 10^13 | 5.155 | 3.947 | 3.353 | 0.405 |
- * | 10^12 | 1.059 | 0.700 | 0.626 | 0.0936 |
- * | 10^11 | 0.227 | 0.138 | 0.124 | 0.0227 |
- * | 10^10 | 0.0509| 0.0309| 0.0286| 0.00589|
+ * | n | Legendre | Meissel | Lehmer | LMOS | LMO |
+ * +-------+----------+----------+----------+----------+-----------+
+ * | 10^19 | | | | | 2493.4 |
+ * | 10^18 | | | | | 498.16 |
+ * | 10^17 | | 2950.9 | 7499.0 | 7125.7 | 103.03 |
+ * | 10^16 | 2422.7 | 575.8 | 872.5 | 884.9 | 21.94 |
+ * | 10^15 | 303.3 | 112.1 | 119.3 | 119.1 | 4.786 |
+ * | 10^14 | 40.10 | 22.06 | 19.06 | 17.58 | 1.052 |
+ * | 10^13 | 5.678 | 4.330 | 3.316 | 2.810 | 0.237 |
+ * | 10^12 | 0.901 | 0.889 | 0.617 | 0.524 | 54.9ms |
+ * | 10^11 | 0.182 | 0.192 | 0.122 | 0.114 | 13.80ms|
+ * | 10^10 | 40.2ms| 41.7ms| 26.6ms| 25.6ms| 3.64ms|
+ *
+ * Run with high memory limits: Meissel uses 1GB for 10^16, ~3GB for 10^17.
+ * Lehmer is limited at high n values by sieving speed. It is much faster
+ * using parallel primesieve, though cannot come close to LMO.
*/
/* Below this size, just sieve (with table speedup). */
@@ -77,26 +78,27 @@
typedef uint32_t uint32;
#endif
+/* UV is either uint32 or uint64 depending on Perl. We use this native size
+ * for the basic unit of the phi sieve. It can be easily overridden here. */
typedef UV sword_t;
#define SWORD_BITS BITS_PER_WORD
#define SWORD_ONES UV_MAX
#define SWORD_MASKBIT(bits) (UVCONST(1) << ((bits) % SWORD_BITS))
#define SWORD_CLEAR(s,bits) s[bits/SWORD_BITS] &= ~SWORD_MASKBIT(bits)
-#if defined(__GNUC__) && defined(__SSE4_2__)
-static sword_t bitcount(sword_t b) {
- sword_t ret;
- __asm__("popcnt %1, %0" : "=r" (ret) : "r" (b));
- return ret;
-}
-#elif SWORD_BITS == 64
-static sword_t bitcount(sword_t b) {
- b -= (b >> 1) & 0x5555555555555555;
- b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
- b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
- return (b * 0x0101010101010101)>>56;
-}
-#else /* Faster than __builtin_popcount */
+/* Compile with -march=native to get a large speedup on Nahalem and newer */
+#if SWORD_BITS == 64
+ #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR > 1))
+ #define bitcount(b) __builtin_popcountll(b)
+ #else
+ static sword_t bitcount(sword_t b) {
+ b -= (b >> 1) & 0x5555555555555555;
+ b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
+ b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
+ return (b * 0x0101010101010101)>>56;
+ }
+ #endif
+#else
static const unsigned char byte_ones[256] =
{0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
@@ -124,15 +126,27 @@ static uint32_t* make_primelist(uint32 n, uint32* number_of_primes)
: (n/log(n)) * (1.0+1.0/logn+2.51/(logn*logn));
*number_of_primes = 0;
New(0, plist, max_index+1, uint32_t);
- if (plist == 0)
- croak("Can not allocate small primes\n");
+ if (plist == 0) croak("Can not allocate small primes\n");
plist[0] = 0;
+ /* We could do a simple SoE here. This is not time critical. */
START_DO_FOR_EACH_PRIME(2, n) {
plist[++i] = p;
} END_DO_FOR_EACH_PRIME;
*number_of_primes = i;
return plist;
}
+#if 0 /* primesieve 5.0 example */
+#include <primesieve.h>
+static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) {
+ uint32_t plist;
+ uint32_t* psprimes = generate_primes(2, n, number_of_primes, UINT_PRIMES);
+ New(0, plist, *number_of_primes + 1, uint32_t);
+ plist[0] = 0;
+ memcpy(plist+1, psprimes, *number_of_primes * sizeof(uint32_t));
+ primesieve_free(psprimes);
+ return plist;
+}
+#endif
/* Given a max prime in small prime list, return max prev prime input */
static uint32 prev_sieve_max(UV maxprime) {
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git
More information about the Pkg-perl-cvs-commits
mailing list