[libmath-prime-util-perl] 12/181: LMO phi sieve is UV-word based instead of 32-bit-word bases

Partha P. Mukherjee ppm-guest at moszumanska.debian.org
Thu May 21 18:51:01 UTC 2015


This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.36
in repository libmath-prime-util-perl.

commit 36629539198a97ac4f2c973f49ef2b0ad2b46d66
Author: Dana Jacobsen <dana at acm.org>
Date:   Fri Dec 13 15:42:02 2013 -0800

    LMO phi sieve is UV-word based instead of 32-bit-word bases
---
 lmo.c | 121 +++++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 61 insertions(+), 60 deletions(-)

diff --git a/lmo.c b/lmo.c
index 619ba7f..fc85ca5 100644
--- a/lmo.c
+++ b/lmo.c
@@ -77,6 +77,26 @@
   typedef uint32_t       uint32;
 #endif
 
+typedef  UV  sword_t;
+#define SWORD_BITS  BITS_PER_WORD
+#define SWORD_ONES  UV_MAX
+#define SWORD_MASKBIT(bits)  (UVCONST(1) << ((bits) % SWORD_BITS))
+#define SWORD_CLEAR(s,bits)  s[bits/SWORD_BITS] &= ~SWORD_MASKBIT(bits)
+
+#if defined(__GNUC__) && defined(__SSE4_2__)
+static sword_t bitcount(sword_t b) {
+  sword_t ret;
+  __asm__("popcnt %1, %0" : "=r" (ret) : "r" (b));
+  return ret;
+}
+#elif SWORD_BITS == 64
+static sword_t bitcount(sword_t b) {
+  b -= (b >> 1) & 0x5555555555555555;
+  b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
+  b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
+  return (b * 0x0101010101010101)>>56;
+}
+#else   /* Faster than __builtin_popcount */
 static const unsigned char byte_ones[256] =
   {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
@@ -86,24 +106,12 @@ static const unsigned char byte_ones[256] =
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};
-
-static uint32_t bitcount(uint32_t b) {
-  /* The simple table method is faster than __builtin_popcount or
-   * using a 11-bit word table.  It is slower than the Nahalem asm. */
-#if 1
-  return byte_ones[b&0xFF] + byte_ones[(b>>8)&0xFF] + byte_ones[(b>>16)&0xFF] + byte_ones[b>>24];
-#else
-  uint32_t ret;
-  __asm__("popcnt %1, %0" : "=r" (ret) : "r" (b));
-  return ret;
-#endif
+static sword_t bitcount(sword_t b) {
+  return byte_ones[(b    )&0xFF] + byte_ones[(b>> 8)&0xFF]
+       + byte_ones[(b>>16)&0xFF] + byte_ones[(b>>24)     ];
 }
+#endif
 
-/* static uint32 count_one_bits(const unsigned char* m, uint32 nbytes) {
-  uint32 count = 0;
-  while (nbytes--)  count += byte_ones[*m++];
-  return count;
-} */
 
 /* Create array of small primes: 0,2,3,5,...,prev_prime(n+1) */
 static uint32_t* make_primelist(uint32 n, uint32* number_of_primes)
@@ -242,7 +250,7 @@ static UV mapes(UV x, uint32 a)
 }
 
 typedef struct {
-  uint32_t *sieve;                 /* segment bit mask */
+  sword_t  *sieve;                 /* segment bit mask */
   uint8    *word_count;            /* bit count in each 64-bit word */
   uint32   *word_count_sum;        /* cumulative sum of word_count */
   UV       *totals;                /* total bit count for all phis at index */
@@ -257,22 +265,15 @@ typedef struct {
   uint32    last_prime_to_remove;  /* index of last prime p, p^2 in segment */
 } sieve_t;
 
-/* Size of phi sieve in 32-bit words.  Multiple of 2*2*3*5*7*11 bytes. */
+/* Size of phi sieve in words.  Multiple of 3*5*7*11 words. */
 #define PHI_SIEVE_WORDS (1155 * PHI_SIEVE_MULT)
 
 /* Bit counting using cumulative sums.  A bit slower than using a running sum,
  * but a little simpler and can be run in parallel. */
-static void make_sieve_counts(const uint32_t* sieve, uint32 sieve_size, uint8* sieve_word_count) {
-  uint32 lwords = (sieve_size + 127) / 128;
-  while (lwords--) {
-    *sieve_word_count++ = bitcount(sieve[0]) + bitcount(sieve[1]);
-    sieve += 2;
-  }
-}
 static uint32 make_sieve_sums(uint32 sieve_size, const uint8* sieve_word_count, uint32* sieve_word_count_sum) {
-  uint32 i, bc, lwords = (sieve_size + 127) / 128;
+  uint32 i, bc, words = (sieve_size + 2*SWORD_BITS-1) / (2*SWORD_BITS);
   sieve_word_count_sum[0] = 0;
-  for (i = 0, bc = 0; i+7 < lwords; i += 8) {
+  for (i = 0, bc = 0; i+7 < words; i += 8) {
     const uint8* cntptr = sieve_word_count + i;
     uint32* sumptr = sieve_word_count_sum + i;
     sumptr[1] = bc += cntptr[0];
@@ -284,17 +285,16 @@ static uint32 make_sieve_sums(uint32 sieve_size, const uint8* sieve_word_count,
     sumptr[7] = bc += cntptr[6];
     sumptr[8] = bc += cntptr[7];
   }
-  for (; i < lwords; i++)
+  for (; i < words; i++)
     sieve_word_count_sum[i+1] = sieve_word_count_sum[i] + sieve_word_count[i];
-  return sieve_word_count_sum[lwords];
+  return sieve_word_count_sum[words];
 }
 
-static UV _sieve_phi(UV segment_x, const uint32_t* sieve, const uint32* sieve_word_count_sum) {
+static UV _sieve_phi(UV segment_x, const sword_t* sieve, const uint32* sieve_word_count_sum) {
   uint32 bits = (segment_x + 1) / 2;
-  uint32 words = bits / 32;
-  uint32 sieve_sum = sieve_word_count_sum[words/2];
-  if (words & 1) sieve_sum += bitcount( sieve[words-1] );
-  sieve_sum += bitcount( sieve[words] & ~(0xfffffffful << (bits % 32)) );
+  uint32 words = bits / SWORD_BITS;
+  uint32 sieve_sum = sieve_word_count_sum[words];
+  sieve_sum += bitcount( sieve[words] & ~(SWORD_ONES << (bits % SWORD_BITS)) );
   return sieve_sum;
 }
 
@@ -303,11 +303,11 @@ static UV _sieve_phi(UV segment_x, const uint32_t* sieve, const uint32* sieve_wo
  * clever, and does the job. */
 
 #define sieve_zero(sieve, si, wordcount) \
-  { uint32 index = si/32; \
-    uint32 mask = 1UL << (si % 32); \
+  { uint32  index = si/SWORD_BITS; \
+    sword_t mask  = SWORD_MASKBIT(si); \
     if (sieve[index] & mask) { \
       sieve[index] &= ~mask; \
-      wordcount[index/2]--; \
+      wordcount[index]--; \
     }  }
 
 #define sieve_case_zero(casenum, skip, si, p, size, mult, sieve, wordcount) \
@@ -319,7 +319,7 @@ static UV _sieve_phi(UV segment_x, const uint32_t* sieve, const uint32* sieve_wo
 static void remove_primes(uint32 index, uint32 last_index, sieve_t* s, const uint32_t* primes)
 {
   uint32    size = (s->size + 1) / 2;
-  uint32_t *sieve = s->sieve;
+  sword_t  *sieve = s->sieve;
   uint8    *word_count = s->word_count;
 
   s->phi_total = s->totals[last_index];
@@ -353,10 +353,10 @@ static void remove_primes(uint32 index, uint32 last_index, sieve_t* s, const uin
   s->totals[last_index] += make_sieve_sums(s->size, s->word_count, s->word_count_sum);
 }
 
-static void word_tile (uint32_t* source, uint32 from, uint32 to) {
+static void word_tile (sword_t* source, uint32 from, uint32 to) {
   while (from < to) {
     uint32 words = (2*from > to) ? to-from : from;
-    memcpy(source+from, source, 4*words);
+    memcpy(source+from, source, sizeof(sword_t)*words);
     from += words;
   }
 }
@@ -364,7 +364,7 @@ static void word_tile (uint32_t* source, uint32 from, uint32 to) {
 static void init_segment(sieve_t* s, UV segment_start, uint32 size, uint32 start_prime_index, uint32 sieve_last, const uint32_t* primes)
 {
   uint32    i, words;
-  uint32_t* sieve = s->sieve;
+  sword_t*  sieve = s->sieve;
   uint8*    word_count = s->word_count;
 
   s->start = segment_start;
@@ -391,36 +391,37 @@ static void init_segment(sieve_t* s, UV segment_start, uint32 size, uint32 start
     s->multiplier[s->last_prime_to_remove] = (p % 30) * 8 / 30;
   }
 
-  memset(sieve, 0xFF, 3*4);        /* Set first 3 words to all 1 bits */
+  memset(sieve, 0xFF, 3*sizeof(sword_t));  /* Set first 3 words to all 1 bits */
   if (start_prime_index >= 3)      /* Remove multiples of 3. */
-    for (i = 3/2; i < 3 * 32; i += 3)
-      sieve[i / 32] &= ~(1ul << (i % 32));
+    for (i = 3/2; i < 3 * SWORD_BITS; i += 3)
+      SWORD_CLEAR(sieve, i);
 
   word_tile(sieve, 3, 15);         /* Copy to first 15 = 3*5 words */
   if (start_prime_index >= 3)      /* Remove multiples of 5. */
-    for (i = 5/2; i < 15 * 32; i += 5)
-      sieve[i / 32] &= ~(1ul << (i % 32));
+    for (i = 5/2; i < 15 * SWORD_BITS; i += 5)
+      SWORD_CLEAR(sieve, i);
 
   word_tile(sieve, 15, 105);       /* Copy to first 105 = 3*5*7 words */
   if (start_prime_index >= 4)      /* Remove multiples of 7. */
-    for (i = 7/2; i < 105 * 32; i += 7)
-      sieve[i / 32] &= ~(1ul << (i % 32));
+    for (i = 7/2; i < 105 * SWORD_BITS; i += 7)
+      SWORD_CLEAR(sieve, i);
 
   word_tile(sieve, 105, 1155);     /* Copy to first 1155 = 3*5*7*11 words */
   if (start_prime_index >= 5)      /* Remove multiples of 11. */
-    for (i = 11/2; i < 1155 * 32; i += 11)
-      sieve[i / 32] &= ~(1ul << (i % 32));
+    for (i = 11/2; i < 1155 * SWORD_BITS; i += 11)
+      SWORD_CLEAR(sieve, i);
 
   size = (size+1) / 2;             /* size to odds */
-  words = (size + 31) / 32;        /* sieve size in 32-bit words */
+  words = (size + SWORD_BITS-1) / SWORD_BITS;   /* sieve size in words */
   word_tile(sieve, 1155, words);   /* Copy first 1155 words to rest */
   /* Zero all unused bits and words */
-  if (size % 32)
-    sieve[words-1] &= ~ (0xffffffffUL << (size % 32));
-  memset(sieve + words, 0x00, 4*(PHI_SIEVE_WORDS+2-words));
+  if (size % SWORD_BITS)
+    sieve[words-1] &= ~(SWORD_ONES << (size % SWORD_BITS));
+  memset(sieve + words, 0x00, sizeof(sword_t)*(PHI_SIEVE_WORDS+2 - words));
 
   /* Create counts, remove primes (updating counts and sums). */
-  make_sieve_counts(sieve, s->size, word_count);
+  for (i = 0; i < words; i++)
+    word_count[i] = bitcount(sieve[i]);
   remove_primes(6, start_prime_index, s, primes);
 }
 
@@ -467,9 +468,9 @@ UV _XS_LMO_pi(UV n)
     croak("Allocation failure in LMO Pi\n");
 
   /* Create other arrays */
-  New(0, ss.sieve,           PHI_SIEVE_WORDS   + 2, uint32_t);
-  New(0, ss.word_count,      PHI_SIEVE_WORDS/2 + 2, uint8);
-  New(0, ss.word_count_sum,  PHI_SIEVE_WORDS/2 + 2, uint32);
+  New(0, ss.sieve,           PHI_SIEVE_WORDS   + 2, sword_t);
+  New(0, ss.word_count,      PHI_SIEVE_WORDS   + 2, uint8);
+  New(0, ss.word_count_sum,  PHI_SIEVE_WORDS   + 2, uint32);
   New(0, ss.totals,          K3+2, UV);
   New(0, ss.prime_index,     K3+2, uint32);
   New(0, ss.first_bit_index, K3+2, uint32);
@@ -552,8 +553,8 @@ UV _XS_LMO_pi(UV n)
 
   for (sieve_start = 0; sieve_start < last_phi_sieve; sieve_start = sieve_end) {
     /* This phi segment goes from sieve_start to sieve_end. */
-    sieve_end = ((sieve_start + 64*PHI_SIEVE_WORDS) <  last_phi_sieve)
-              ?   sieve_start + 64*PHI_SIEVE_WORDS  :  last_phi_sieve;
+    sieve_end = ((sieve_start + 2*SWORD_BITS*PHI_SIEVE_WORDS) <  last_phi_sieve)
+              ?   sieve_start + 2*SWORD_BITS*PHI_SIEVE_WORDS  :  last_phi_sieve;
     /* Only divisors s.t. sieve_start <= N / divisor < sieve_end considered. */
     least_divisor = n / sieve_end;
     /* Initialize the sieve segment and all associated variables. */

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git



More information about the Pkg-perl-cvs-commits mailing list