[libmath-prime-util-perl] 10/16: Improve threading

Thu May 21 18:44:35 UTC 2015

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.08
in repository libmath-prime-util-perl.

commit e5857a6430a1d97591e95f109a3c74f5f6bfce8e
Author: Dana Jacobsen <dana at acm.org>
Date:   Wed Jun 20 19:17:27 2012 -0600

    Improve threading
---
 XS.xs                  |   2 +-
 cache.c                | 131 ++++++++++++++++++++++++++++++-------------------
 cache.h                |   7 ++-
 lib/Math/Prime/Util.pm |  24 ++++-----
 t/31-threading.t       |  78 ++++++++++++++++++++++-------
 5 files changed, 159 insertions(+), 83 deletions(-)

diff --git a/XS.xs b/XS.xs
index 929a76d..1e0e0f2 100644
--- a/XS.xs
+++ b/XS.xs
@@ -257,7 +257,7 @@ factor(IN UV n)
             if (!split_success) {
               split_success = holf_factor(n, factor_stack+nstack, 2000)-1;
             }
-            /* Very, very few numbers make it past here */
+            /* Less than 0.00003% of numbers make it past here. */
           }
           if (split_success) {
             MPUassert( split_success == 1, "split factor returned more than 2 factors");
diff --git a/cache.c b/cache.c
index bacc5c8..aecdc44 100644
--- a/cache.c
+++ b/cache.c
@@ -36,84 +36,118 @@ static int        primary_number_of_readers = 0;
 static unsigned char* prime_cache_sieve = 0;
 static UV             prime_cache_size = 0;
 
-/* Fill the primary cache up to n */
-static void _fill_prime_cache(UV n, int nowait) {
-  if (!nowait) { MUTEX_LOCK(&primary_mutex_no_waiting); }
-  MUTEX_LOCK(&primary_mutex_no_accessing);
-  if (!nowait) { MUTEX_UNLOCK(&primary_mutex_no_waiting); }
-  if (prime_cache_size < n) {
-    if (prime_cache_sieve != 0)
-      Safefree(prime_cache_sieve);
-    prime_cache_sieve = 0;
-    prime_cache_size = 0;
+/* To avoid thrashing, sieve a little farther than we absolutely need to. */
+#define FILL_EXTRA_N (128*30)
+
+/* Erase the primary cache and fill up to n. */
+static void _erase_and_fill_prime_cache(UV n) {
+  UV padded_n;
+  /* Note: You need to handle mutexes around this.
+   *   MUTEX_LOCK(&primary_mutex_no_waiting);
+   *   MUTEX_LOCK(&primary_mutex_no_accessing);
+   *   MUTEX_UNLOCK(&primary_mutex_no_waiting);
+   *   _fill_prime_cache(n);
+   *   MUTEX_UNLOCK(&primary_mutex_no_accessing);
+   */
 
-    /* Sieve a bit more than asked, to mitigate thrashing */
-    if (n >= (UV_MAX-3840))
-      n = UV_MAX;
-    else
-      n = ((n + 3840)/30)*30;
-    /* TODO: testing near 2^32-1 */
+  if (n >= (UV_MAX-FILL_EXTRA_N))
+    padded_n = UV_MAX;
+  else
+    padded_n = ((n + FILL_EXTRA_N)/30)*30;
 
-    prime_cache_sieve = sieve_erat30(n);
+  /* If new size isn't larger or smaller, then we're done. */
+  if (prime_cache_size == padded_n)
+    return;
 
+  if (prime_cache_sieve != 0)
+    Safefree(prime_cache_sieve);
+  prime_cache_sieve = 0;
+  prime_cache_size = 0;
+
+  if (n > 0) {
+    prime_cache_sieve = sieve_erat30(padded_n);
     if (prime_cache_sieve != 0)
-      prime_cache_size = n;
-    /* printf("size to %llu\n", prime_cache_size); fflush(stdout); */
+      prime_cache_size = padded_n;
   }
-  MUTEX_UNLOCK(&primary_mutex_no_accessing);
 }
 
 /*
  * Get the size and a pointer to the cached prime sieve.
- * Returns the maximum sieved value in the sieve.
+ * Returns the maximum sieved value available.
  * Allocates and sieves if needed.
  *
  * The sieve holds 30 numbers per byte, using a mod-30 wheel.
  */
 UV get_prime_cache(UV n, const unsigned char** sieve)
 {
+#ifdef USE_ITHREADS
   int prev_readers;
+  int i_hold_access_lock = 0;
 
   if (sieve == 0) {
     if (prime_cache_size < n) {
-      _fill_prime_cache(n, 0);
+      MUTEX_LOCK(&primary_mutex_no_waiting);
+      MUTEX_LOCK(&primary_mutex_no_accessing);
+      MUTEX_UNLOCK(&primary_mutex_no_waiting);
+      _erase_and_fill_prime_cache(n);
+      MUTEX_UNLOCK(&primary_mutex_no_accessing);
     }
     return prime_cache_size;
   }
 
-  if (prime_cache_size < n)
-    _fill_prime_cache(n, 0);
-
-  /* TODO: We've got a problem.  If another thread does a memfree right here,
-   * then we'll return a size less than n.  Everything technically works, but
-   * there will be sieve croaks because they can't get enough primes.
-   */
-
   MUTEX_LOCK(&primary_mutex_no_waiting);
+    /* If we need more primes, get the access lock right now */
+    if (prime_cache_size < n) {
+      MUTEX_LOCK(&primary_mutex_no_accessing);
+      i_hold_access_lock = 1;
+    }
+
     MUTEX_LOCK(&primary_mutex_counter);
       prev_readers = primary_number_of_readers;
       primary_number_of_readers++;
     MUTEX_UNLOCK(&primary_mutex_counter);
-    if (prev_readers == 0) { MUTEX_LOCK(&primary_mutex_no_accessing); }
+
+    if ( (prev_readers == 0) && (!i_hold_access_lock) ) {
+      MUTEX_LOCK(&primary_mutex_no_accessing);
+      i_hold_access_lock = 1;
+    }
+    if (prime_cache_size < n) {
+      _erase_and_fill_prime_cache(n);
+    }
   MUTEX_UNLOCK(&primary_mutex_no_waiting);
 
+  MPUassert(prime_cache_size >= n, "prime cache is too small!");
+
   *sieve = prime_cache_sieve;
   return prime_cache_size;
+#else
+  if (prime_cache_size < n)
+    _erase_and_fill_prime_cache(n);
+  if (sieve != 0)
+    *sieve = prime_cache_sieve;
+  return prime_cache_size;
+#endif
 }
+
 void release_prime_cache(const unsigned char* mem) {
+#ifdef USE_ITHREADS
   int current_readers;
   MUTEX_LOCK(&primary_mutex_counter);
     primary_number_of_readers--;
     current_readers = primary_number_of_readers;
   MUTEX_UNLOCK(&primary_mutex_counter);
   if (current_readers == 0) { MUTEX_UNLOCK(&primary_mutex_no_accessing); }
+#endif
 }
 
 
 
-#define SEGMENT_CHUNK_SIZE  UVCONST(256*1024*1024-8)
+/* The segment everyone is trying to share */
+#define PRIMARY_SEGMENT_CHUNK_SIZE    UVCONST(256*1024*1024-16)
 static unsigned char* prime_segment = 0;
 static int prime_segment_is_available = 1;
+/* If that's in use, malloc a new one of this size */
+#define SECONDARY_SEGMENT_CHUNK_SIZE  UVCONST( 64*1024*1024-16)
 
 unsigned char* get_prime_segment(UV *size) {
   unsigned char* mem;
@@ -133,13 +167,12 @@ unsigned char* get_prime_segment(UV *size) {
 
   if (use_prime_segment) {
     if (prime_segment == 0)
-      New(0, prime_segment, SEGMENT_CHUNK_SIZE, unsigned char);
-    *size = SEGMENT_CHUNK_SIZE;
+      New(0, prime_segment, PRIMARY_SEGMENT_CHUNK_SIZE, unsigned char);
+    *size = PRIMARY_SEGMENT_CHUNK_SIZE;
     mem = prime_segment;
   } else {
-    UV chunk_size = 64*1024*1024-8;
-    New(0, mem, chunk_size, unsigned char);
-    *size = chunk_size;
+    New(0, mem, SECONDARY_SEGMENT_CHUNK_SIZE, unsigned char);
+    *size = SECONDARY_SEGMENT_CHUNK_SIZE;
   }
 
   if (mem == 0)
@@ -147,6 +180,7 @@ unsigned char* get_prime_segment(UV *size) {
 
   return mem;
 }
+
 void release_prime_segment(unsigned char* mem) {
   MUTEX_LOCK(&segment_mutex);
     if (mem == prime_segment) {
@@ -159,6 +193,7 @@ void release_prime_segment(unsigned char* mem) {
 
 
 
+#define INITIAL_CACHE_SIZE ((1024-16)*30 - FILL_EXTRA_N)
 void prime_precalc(UV n)
 {
   if (!mutex_init) {
@@ -171,7 +206,7 @@ void prime_precalc(UV n)
 
   /* On initialization, make a few primes (2-30k using 1k memory) */
   if (n == 0)
-    n = (1024-16)*30;
+    n = INITIAL_CACHE_SIZE;
   get_prime_cache(n, 0);   /* Sieve to n */
 
   /* TODO: should we prealloc the segment here? */
@@ -180,18 +215,7 @@ void prime_precalc(UV n)
 
 void prime_memfree(void)
 {
-  MPUassert(mutex_init == 1, "segment mutex has not been initialized");
-
-  MUTEX_LOCK(&primary_mutex_no_waiting);
-  MUTEX_LOCK(&primary_mutex_no_accessing);
-  MUTEX_UNLOCK(&primary_mutex_no_waiting);
-  if ( (prime_cache_sieve != 0) ) {
-    /* printf("size to 0  nreaders: %d\n", primary_number_of_readers); fflush(stdout); */
-    Safefree(prime_cache_sieve);
-    prime_cache_sieve = 0;
-    prime_cache_size = 0;
-  }
-  MUTEX_UNLOCK(&primary_mutex_no_accessing);
+  MPUassert(mutex_init == 1, "cache mutexes have not been initialized");
 
   MUTEX_LOCK(&segment_mutex);
   /* Don't free if another thread is using it */
@@ -201,7 +225,12 @@ void prime_memfree(void)
   }
   MUTEX_UNLOCK(&segment_mutex);
 
-  prime_precalc(0);
+  MUTEX_LOCK(&primary_mutex_no_waiting);
+  MUTEX_LOCK(&primary_mutex_no_accessing);
+  MUTEX_UNLOCK(&primary_mutex_no_waiting);
+    /* Put primary cache back to initial state */
+    _erase_and_fill_prime_cache(INITIAL_CACHE_SIZE);
+  MUTEX_UNLOCK(&primary_mutex_no_accessing);
 }
 
 
diff --git a/cache.h b/cache.h
index 97592ca..9867276 100644
--- a/cache.h
+++ b/cache.h
@@ -9,8 +9,11 @@ extern void  prime_precalc(UV x);
   /* Release all extra memory -- go back to initial amounts */
 extern void  prime_memfree(void);
 
-  /* Get the primary cache.  Returns the size.  Tries to make it
-   * at least size n.  Sets sieve* to the cache, unless given 0.
+  /* Get the primary cache (mod-30 wheel sieve).
+   * Try to make sure it contains n.
+   * Returns the maximum value in the cache.
+   * Sets sieve* to the cache, unless given 0.
+   * If you get a pointer back, you MUST call release when you're done.
    *
    * Ex: just give me the current size:
    *   UV cache_size = get_prime_cache(0, 0);
diff --git a/lib/Math/Prime/Util.pm b/lib/Math/Prime/Util.pm
index 3d0bab0..e0e4300 100644
--- a/lib/Math/Prime/Util.pm
+++ b/lib/Math/Prime/Util.pm
@@ -323,11 +323,11 @@ functions inside Perl now, I recommend L<Math::Pari>.
 
 The default sieving and factoring are intended to be (and currently are)
 the fastest on CPAN, including L<Math::Prime::XS>, L<Math::Prime::FastSieve>,
-and L<Math::Factor::XS>.  L<Math::Pari> is slower in some things, faster in
-others.
+and L<Math::Factor::XS>.  It seems to be faster than L<Math::Pari> for
+everything except factoring certain 16-20 digit numbers.
 
-The module is thread-safe, though threads should not call the memory free
-routines.
+The module is thread-safe and allows concurrency between Perl threads while
+still sharing a prime cache.  It is not itself multithreaded.
 
 
 =head1 FUNCTIONS
@@ -613,9 +613,9 @@ always result in the input value, though those are the only cases where
 the returned factors are not prime.
 
 The current algorithm is to use trial division for small numbers, while large
-numbers go through a sequence of small trials, SQUFOF, Pollard's Rho, and
-finally trial division for any survivors.  This process is repeated for
-each non-prime factor.
+numbers go through a sequence of small trials, SQUFOF, Pollard's Rho, Hart's
+one line factorization, and finally trial division for any survivors.  This
+process is repeated for each non-prime factor.
 
 
 =head2 all_factors
@@ -745,15 +745,15 @@ I have not completed testing all the functions near the word size limit
 (e.g. C<2^32> for 32-bit machines).  Please report any problems you find.
 
 The extra factoring algorithms are mildly interesting but really limited by
-not being big number aware.
+not being big number aware.  Assuming a desktop PC, every 32-bit number
+should be factored by the main routine in a few microseconds, and 64-bit
+numbers should be a few milliseconds at worst.
 
 Perl versions earlier than 5.8.0 have issues with 64-bit.  The test suite will
 try to determine if your Perl is broken.  This will show up in factoring tests.
 Perl 5.6.2 32-bit works fine, as do later versions with 32-bit and 64-bit.
 
-The module is thread-safe and should allow good concurrency.  There are still
-some issues if threads call prime_memfree while other threads are sieving
-that are being worked on.
+The module is thread-safe and should allow good concurrency.
 
 
 =head1 PERFORMANCE
@@ -826,6 +826,7 @@ The differences are in the implementations:
      deterministic set of Miller-Rabin tests for large numbers.
 
 
+
 Factoring performance depends on the input, and the algorithm choices used
 are still being tuned.  Compared to Math::Factor::XS, it is a tiny bit faster
 for most input under 10M or so, and rapidly gets faster.  For numbers
@@ -849,6 +850,7 @@ beyond that is the Generalized Number Field Sieve.  For serious factoring,
 I recommend looking info C<yafu>, C<msieve>, C<Pari>, and C<GGNFS>.
 
 
+
 =head1 AUTHORS
 
 Dana Jacobsen E<lt>dana at acm.orgE<gt>
diff --git a/t/31-threading.t b/t/31-threading.t
index e73dcd2..8959276 100644
--- a/t/31-threading.t
+++ b/t/31-threading.t
@@ -15,24 +15,66 @@ BEGIN {
   }
 }
 
-use Test::More 'tests' => 1;
+use Test::More 'tests' => 9;
 use Math::Prime::Util ":all";
-my $nthreads = 64;
 
 srand(50);
-my @digits;
-push @digits, random_ndigit_prime(6) for (0..9);
-
-my $tsub = sub { my $sum = 0;  $sum += prime_count($_) for (@digits); return $sum;};
-my @threads;
-# Fire off all our threads
-push @threads, threads->create($tsub) for (1..$nthreads);
-# Retrieve results
-my $par_sum = 0;
-$par_sum += $_->join() for (@threads);
-
-# Now try it on main
-my $seq_sum = 0;
-$seq_sum += $tsub->() for (1..$nthreads);
-
-is($par_sum, $seq_sum, "$nthreads threads summed prime count");
+my @randn;
+push @randn, rand(100000) for (0..37);
+
+thread_test(
+  sub { my $sum = 0;  $sum += prime_count($_) for (@randn); return $sum;},
+  8, "sum prime_count");
+
+thread_test(
+  sub { my $sum = 0;  for (@randn) {$sum += prime_count($_); prime_memfree; } return $sum;},
+  8, "sum prime_count with overlapping memfree calls");
+
+thread_test(
+  sub { my $sum = 0; for my $d (@randn) { for my $f (factor($d)) { $sum += $f; } } return $sum; },
+  8, "factor");
+
+thread_test(
+  sub { my $sum = 0;  $sum += nth_prime($_) for (@randn); return $sum;},
+  8, "nth_prime");
+
+thread_test(
+  sub { my $sum = 0;  $sum += next_prime($_) for (@randn); return $sum;},
+  8, "next_prime");
+
+thread_test(
+  sub { my $sum = 0;  $sum += prev_prime($_) for (@randn); return $sum;},
+  8, "prev_prime");
+
+thread_test(
+  sub { my $sum = 0;  $sum += is_prime($_) for (@randn); return $sum;},
+  8, "is_prime");
+
+thread_test(
+  sub { my $sum = 0;  for (@randn) { srand($_); $sum += random_ndigit_prime(6); } return $sum;},
+  8, "random 7-digit prime");
+
+thread_test(
+  sub { my $sum = 0;  $sum += int(RiemannR($_)) for (@randn); return $sum;},
+  8, "RiemannR");
+
+sub thread_test {
+  my $tsub = shift;
+  my $nthreads = shift;
+  my $text = shift;
+
+  my @threads;
+  # Fire off all our threads
+  push @threads, threads->create($tsub) for (1..$nthreads);
+  # Get results
+  my $par_sum = 0;
+  $par_sum += $_->join() for (@threads);
+  prime_memfree;
+
+  # Now do the same operation sequentially
+  my $seq_sum = 0;
+  $seq_sum += $tsub->() for (1..$nthreads);
+  prime_memfree;
+
+  is($par_sum, $seq_sum, "$nthreads threads $text");
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git