[libmath-prime-util-perl] 38/50: Comments and a small speedup for Lehmer
Partha P. Mukherjee
ppm-guest at moszumanska.debian.org
Thu May 21 18:45:38 UTC 2015
This is an automated email from the git hooks/post-receive script.
ppm-guest pushed a commit to annotated tag v0.13
in repository libmath-prime-util-perl.
commit 3b884098fe37e7793d923f0975ebd9918df4961a
Author: Dana Jacobsen <dana at acm.org>
Date: Fri Nov 16 03:49:37 2012 -0800
Comments and a small speedup for Lehmer
---
lehmer.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 184 insertions(+), 26 deletions(-)
diff --git a/lehmer.c b/lehmer.c
index 5c82ba1..4977c0f 100644
--- a/lehmer.c
+++ b/lehmer.c
@@ -8,8 +8,71 @@
#include "cache.h"
#include "sieve.h"
+/*****************************************************************************
+ *
+ * Lehmer prime counting utility. Calculates pi(x), count of primes <= x.
+ *
+ * Copyright (c) 2012 Dana Jacobsen (dana at acm.org).
+ * This is free software; you can redistribute it and/or modify it under
+ * the same terms as the Perl 5 programming language system itself.
+ *
+ * This file is part of the Math::Prime::Util Perl module. It relies on two
+ * features: (1) a sieve to generate very small primes which could be as
+ * simple as a three-line SoE, and (2) _XS_prime_count(low, high) is expected
+ * to return the prime count within the segment low to high inclusive. These
+ * are used for the relatively small segments in the final stage, but making
+ * it fast is important for the final performance. The primesieve package is
+ * one source of excellent routines for either task.
+ *
+ * The phi(x,a) calculation is unique, to the best of my knowledge. It keeps
+ * a heap of all x values + signed counts for the given 'a' value, and walks
+ * 'a' down until it is small enough to calculate directly (either with Mapes
+ * or using a calculated table using the primorial/totient method). This
+ * is relatively fast and low memory compared to many other solutions. As with
+ * all Lehmer-Meissel-Legendre algorithms, memory use will be a constraint
+ * with large values of x.
+ *
+ * Calculating pi(10^11) is done in under 1 second on my computer. pi(10^14)
+ * takes about 1 minute, pi(10^16) in under an hour. Compared to Thomas
+ * R. Nicely's pix4 program, this one is about 3-4x faster and uses 3x less
+ * memory. However those comparisons were done without the big pre-computed
+ * prime count tables pix4 can use to speed up the final stage.
+ *
+ * n phi(x,a) mem/time | stage 4 mem/time | total time
+ * 10^17 5988MB 1244.11 | 2688MB
+ * 10^16 1877MB 254.83 | 945MB 1478.9 | 28m 51.4s
+ * 10^15 754MB 50.50 | 392MB 224.2 | 4m 33.8s
+ * 10^14 260MB 9.879 | 217MB 38.06 | 47.83s
+ * 10^13 1.852 | 162MB 6.987 | 8.864s
+ * 10^12 0.363 | 1.358 | 1.768s
+ * 10^11 0.064 | 0.276 | 0.381s
+ * 10^10 xxxxxMB 0.011 | xxxxxMB 0.056 | 0.105s
+ *
+ * Reference: "Prime numbers and computer methods for factorization", Riesel.
+ */
+
static int const verbose = 0;
+#define DO_TIMING 0
+#ifdef DO_TIMING
+ #include <sys/time.h>
+ #define DECLARE_TIMING_VARIABLES struct timeval t0, t1;
+ #define TIMING_START gettimeofday(&t0, 0);
+ #define TIMING_END_PRINT(text) \
+ { unsigned long long t; \
+ gettimeofday(&t1, 0); \
+ t = (t1.tv_sec-t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec); \
+ printf("%s: %10.5f\n", text, ((double)t) / 1000000); }
+#else
+ #define DECLARE_TIMING_VARIABLES
+ #define TIMING_START
+ #define TIMING_END_PRINT(text)
+#endif
+/* Use Mapes' method to calculate phi(x,a) for small a. This is really
+ * convenient and a little Perl script will spit this code out for whatever
+ * limit we select. It gets unwieldy however, and using the primorial/totient
+ * method looks faster for the final output.
+ */
static IV mapes(IV x, UV a)
{
IV val;
@@ -24,6 +87,12 @@ static IV mapes(IV x, UV a)
return val;
}
+/******************************************************************************/
+/* Modified heap for manipulating our UV value / IV count pairs */
+/******************************************************************************/
+
+/* TODO: This should be cleaned up */
+
/* heap of values and counts, stored by value */
typedef struct {
UV v;
@@ -34,15 +103,30 @@ typedef struct {
vc_t* array;
UV array_size;
UV N;
+ UV small_limit;
+ IV* small_array;
+ UV Nsmall;
+ int ptr_small;
} heap_t;
static void heap_insert(heap_t* h, vc_t v)
{
UV k;
+ UV val = v.v;
+ if (v.c == 0)
+ return;
+ if (val < h->small_limit) {
+ if (h->small_array[val] == 0) ++(h->Nsmall);
+ h->small_array[val] += v.c;
+ if (h->small_array[val] == 0) --(h->Nsmall);
+ else if (h->ptr_small < (int)val) h->ptr_small = (int) val;
+// printf("small array insert %lu count %ld, coalesce to %ld, Nsmall = %lu, ptr %d\n", v.v, v.c, h->small_array[val], h->Nsmall, h->ptr_small);
+ return;
+ }
UV n = ++(h->N);
vc_t* a = h->array;
if (n >= h->array_size) {
- UV new_size = (h->array_size == 0) ? 20000 : 2 * h->array_size;
+ UV new_size = (h->array_size == 0) ? 20000 : 1.5 * h->array_size;
if (verbose>2) printf("REALLOCing %p, new size %lu\n", h->array, new_size);
h->array = realloc( h->array, new_size * sizeof(vc_t) );
if (h->array == 0)
@@ -84,7 +168,24 @@ static vc_t heap_remove(heap_t* h)
}
static vc_t heap_remove_coalesce(heap_t* h)
{
- vc_t v = heap_remove(h);
+ vc_t v;
+ if (h->N == 0) {
+ int i;
+ if (h->Nsmall == 0) croak("remove from empty heap\n");
+ for (i = h->ptr_small; i >= 0; i--) {
+ if (h->small_array[i] != 0) {
+ v.v = (UV) i;
+ v.c = h->small_array[i];
+ h->small_array[i] = 0;
+ h->ptr_small = i-1;
+ --(h->Nsmall);
+ //printf("small array returning %lu count %ld\n", v.v, v.c);
+ return v;
+ }
+ }
+ croak("walked off small array\n");
+ }
+ v = heap_remove(h);
/* get rest of entries with same value */
while (h->N > 0 && h->array[1].v == v.v) {
v.c += h->array[1].c;
@@ -92,12 +193,20 @@ static vc_t heap_remove_coalesce(heap_t* h)
}
return v;
}
-static heap_t heap_create(void)
+static heap_t heap_create(UV small_size)
{
heap_t h;
h.array = 0;
h.array_size = 0;
h.N = 0;
+ h.small_limit = small_size;
+ h.small_array = 0;
+ if (small_size > 0) {
+ if (verbose>1)printf("creating small array of size %lu\n", small_size);
+ h.small_array = (IV*) calloc( small_size, sizeof(IV) );
+ }
+ h.Nsmall = 0;
+ h.ptr_small = -1;
return h;
}
static void heap_destroy(heap_t* h)
@@ -109,34 +218,54 @@ static void heap_destroy(heap_t* h)
h->array = 0;
h->array_size = 0;
h->N = 0;
+ if (h->small_array != 0) {
+ free(h->small_array);
+ }
+ h->small_array = 0;
+ h->Nsmall = 0;
+ h->ptr_small = -1;
}
static void heap_empty(heap_t* h)
{
h->N = 0;
+ h->Nsmall = 0;
+ h->ptr_small = -1;
+ if (h->small_limit > 0)
+ memset(h->small_array, 0, h->small_limit * sizeof(IV));
}
+#define heap_not_empty(h) ((h).N > 0 || (h).Nsmall > 0)
+
+/******************************************************************************/
+
-static UV phi(UV x, UV a)
+
+/*
+ * The main phi(x,a) algorithm. In this implementation, it takes about 25%
+ * of the total time for the Lehmer algorithm, but it is by far the most memory
+ * consuming section.
+ */
+
+static UV phi(UV x, UV a, UV* primes)
{
UV i;
- IV primea;
IV sum = 0;
- if (x < _XS_nth_prime(a+1)) return (x > 0) ? 1 : 0;
- if (a == 1) return ((x+1)/2);
- if (a <= 7) return mapes(x, a);
+ UV primea = primes ? primes[a+1] : _XS_nth_prime(a+1);
+ if (x < primea) return (x > 0) ? 1 : 0;
+ if (a == 1) return ((x+1)/2);
+ if (a <= 7) return mapes(x, a);
- primea = _XS_nth_prime(a);
+ primea = primes ? primes[a] : _XS_prev_prime(primea);
- heap_t h1 = heap_create();
- heap_t h2 = heap_create();
+ heap_t h1 = heap_create(20 * x/primea/primea/primea);
+ heap_t h2 = heap_create(20 * x/primea/primea/primea);
vc_t v;
v.v = x; v.c = 1;
heap_insert(&h1, v);
- while (a > 5) {
+ while (a > 6) {
heap_empty(&h2);
- /* Walk heap */
- while (h1.N > 0) {
+ while ( heap_not_empty(h1) ) {
UV sval;
v = heap_remove_coalesce(&h1);
if (v.c == 0)
@@ -153,10 +282,10 @@ static UV phi(UV x, UV a)
}
{ heap_t t = h1; h1 = h2; h2 = t; }
a--;
- primea = _XS_prev_prime(primea);
+ primea = primes ? primes[a] : _XS_prev_prime(primea);
}
heap_destroy(&h2);
- while (h1.N > 0) {
+ while ( heap_not_empty(h1) ) {
v = heap_remove_coalesce(&h1);
if (v.c != 0)
sum += v.c * mapes(v.v, a); /* This could be faster */
@@ -165,6 +294,7 @@ static UV phi(UV x, UV a)
return sum;
}
+
/* Legendre's method. Interesting and a good test for phi(x,a), but Lehmer's
* method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
UV _XS_legendre_pi(UV n)
@@ -174,29 +304,32 @@ UV _XS_legendre_pi(UV n)
UV a = _XS_legendre_pi(sqrt(n));
prime_precalc(a);
- return phi(n, a) + a - 1;
+ return phi(n, a, 0) + a - 1;
}
+/* Lehmer's method. See Riesel for basic program. */
+
UV _XS_lehmer_pi(UV n)
{
+ DECLARE_TIMING_VARIABLES;
UV z, a, b, c, sum, i, j, lastw, lastwpc;
UV* primea = 0; /* small prime cache, first b=pi(z)=pi(sqrt(n)) primes */
UV* pca = 0; /* small prime count cache, first z=sqrt(n) numbers */
if (n < 1000000)
return _XS_prime_count(2, n);
- if (verbose > 0) printf("lehmer stage 1 - %lu\n", n);
+ if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n);
+ TIMING_START;
z = sqrt((double)n+0.5);
a = _XS_lehmer_pi(sqrt((double)z)+0.5);
b = _XS_lehmer_pi(z);
c = _XS_lehmer_pi(pow((double)n, 1.0/3.0)+0.5);
- if (verbose > 0) printf("lehmer stage 2 - %lu (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
- sum = phi(n, a) + ( (b+a-2) * (b-a+1) / 2);
+ TIMING_END_PRINT("stage 1")
- if (verbose > 0) printf("lehmer stage 3 - n=%lu a=%lu\n", n, a);
- /* Calculate the first b primes and first z primecounts */
- {
- UV count = 0;
+
+ if (verbose > 0) printf("lehmer %lu stage 2: %lu small primes, %lu counts\n", n, b, z);
+ TIMING_START;
+ { /* Calculate the first b primes */
const unsigned char* sieve;
if (get_prime_cache(z, &sieve) < z) {
release_prime_cache(sieve);
@@ -218,7 +351,19 @@ UV _XS_lehmer_pi(UV n)
if (i < b)
croak("Did not generate enough small primes. Bug in Lehmer code.\n");
if (verbose > 1) printf("generated first %lu small primes, from 2 to %lu\n", i, z);
+ }
+ TIMING_END_PRINT("small primes")
+
+
+ if (verbose > 0) printf("lehmer %lu stage 3: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
+ TIMING_START;
+ sum = phi(n, a, primea) + ( (b+a-2) * (b-a+1) / 2);
+ TIMING_END_PRINT("phi(x,a)")
+
+ TIMING_START;
+ { /* Generate primecounts up to z used by stage 4. */
+ UV count = 0;
pca = (UV*) malloc( (z+4) * sizeof(UV) );
if (pca == 0)
croak("Can not allocate small prime counts\n");
@@ -229,12 +374,24 @@ UV _XS_lehmer_pi(UV n)
}
if (verbose > 1) printf("generated first %lu prime counts, from 2 to %lu\n", count, z);
}
+ TIMING_END_PRINT("prime counts")
+printf("precalculated counts to %lu\n", z);
- /* Ensure we have the base sieve for prime_count ( n/primea[i] ) */
+ /* Ensure we have the base sieve for prime_count ( n/primea[i] ). */
+ /* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */
prime_precalc( sqrt( n / primea[a+1] ) );
+ /* TODO:
+ * (1) generate 2b primes instead of b
+ * (2) generate counts for all 2b primes
+ * (3) store counts on odd numbers only.
+ * (4) in loop below, check if we have the result in pca and use it.
+ * This will cut in half the prime_count calls below, at a little memory
+ * cost (almost entirely mitigated by change #3)
+ */
- if (verbose > 0) printf("lehmer stage 4 - n=%lu a=%lu\n", n, a);
+ if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primea[a+1]);
+ TIMING_START;
/* Reverse the i loop so w increases. Count w in segments. */
lastw = 0;
lastwpc = 0;
@@ -250,6 +407,7 @@ UV _XS_lehmer_pi(UV n)
}
}
}
+ TIMING_END_PRINT("stage 4")
if (pca != 0)
free(pca);
if (primea != 0)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git
More information about the Pkg-perl-cvs-commits
mailing list