[Pkg-ceph-commits] [ceph] 12/17: Imported Upstream version 10.1.0

James Downing Page jamespage at moszumanska.debian.org
Thu Mar 31 10:53:25 UTC 2016


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch ubuntu-xenial
in repository ceph.

commit 8d207abff415db4e03a4771baf7b29c5ffd7c797
Author: James Page <james.page at ubuntu.com>
Date:   Wed Mar 30 09:44:46 2016 +0100

    Imported Upstream version 10.1.0
---
 .../jerasure/gf-complete/include/gf_complete.h     |  204 ++
 .../jerasure/gf-complete/include/gf_general.h      |   61 +
 .../jerasure/gf-complete/include/gf_int.h          |  200 ++
 .../jerasure/gf-complete/include/gf_method.h       |   20 +
 .../jerasure/gf-complete/include/gf_rand.h         |   22 +
 .../jerasure/gf-complete/include/gf_w16.h          |   66 +
 .../jerasure/gf-complete/include/gf_w32.h          |   71 +
 .../jerasure/gf-complete/include/gf_w4.h           |   63 +
 .../jerasure/gf-complete/include/gf_w64.h          |   50 +
 .../jerasure/gf-complete/include/gf_w8.h           |   99 +
 src/erasure-code/jerasure/gf-complete/src/gf.c     | 1076 ++++++++
 .../jerasure/gf-complete/src/gf_general.c          |  539 ++++
 .../jerasure/gf-complete/src/gf_method.c           |  193 ++
 .../jerasure/gf-complete/src/gf_rand.c             |   80 +
 .../jerasure/gf-complete/src/gf_w128.c             | 1783 +++++++++++++
 src/erasure-code/jerasure/gf-complete/src/gf_w16.c | 2452 +++++++++++++++++
 src/erasure-code/jerasure/gf-complete/src/gf_w32.c | 2823 ++++++++++++++++++++
 src/erasure-code/jerasure/gf-complete/src/gf_w4.c  | 2051 ++++++++++++++
 src/erasure-code/jerasure/gf-complete/src/gf_w64.c | 2218 +++++++++++++++
 src/erasure-code/jerasure/gf-complete/src/gf_w8.c  | 2392 +++++++++++++++++
 .../jerasure/gf-complete/src/gf_wgen.c             | 1019 +++++++
 .../jerasure/gf-complete/src/neon/gf_w16_neon.c    |  356 +++
 .../jerasure/gf-complete/src/neon/gf_w32_neon.c    |  269 ++
 .../jerasure/gf-complete/src/neon/gf_w4_neon.c     |  247 ++
 .../jerasure/gf-complete/src/neon/gf_w64_neon.c    |  333 +++
 .../jerasure/gf-complete/src/neon/gf_w8_neon.c     |  302 +++
 .../jerasure/jerasure/include/cauchy.h             |   45 +
 .../jerasure/jerasure/include/galois.h             |  100 +
 .../jerasure/jerasure/include/jerasure.h           |  294 ++
 .../jerasure/jerasure/include/liberation.h         |   47 +
 .../jerasure/jerasure/include/reed_sol.h           |   50 +
 src/erasure-code/jerasure/jerasure/src/cauchy.c    |  405 +++
 src/erasure-code/jerasure/jerasure/src/galois.c    |  365 +++
 src/erasure-code/jerasure/jerasure/src/jerasure.c  | 1388 ++++++++++
 .../jerasure/jerasure/src/liberation.c             |  262 ++
 src/erasure-code/jerasure/jerasure/src/reed_sol.c  |  302 +++
 36 files changed, 22247 insertions(+)

diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_complete.h b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h
new file mode 100644
index 0000000..c4783e8
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h
@@ -0,0 +1,204 @@
+/* 
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_complete.h
+ *
+ * The main include file for gf_complete. 
+ */
+
+#ifndef _GF_COMPLETE_H_
+#define _GF_COMPLETE_H_
+#include <stdint.h>
+
+#ifdef INTEL_SSE4
+  #ifdef __SSE4_2__
+    #include <nmmintrin.h>
+  #endif
+  #ifdef __SSE4_1__
+    #include <smmintrin.h>
+  #endif
+#endif
+
+#ifdef INTEL_SSSE3
+  #include <tmmintrin.h>
+#endif
+
+#ifdef INTEL_SSE2
+  #include <emmintrin.h>
+#endif
+
+#ifdef INTEL_SSE4_PCLMUL
+  #include <wmmintrin.h>
+#endif
+
+#if defined(ARM_NEON)
+  #include <arm_neon.h>
+#endif
+
+
+/* These are the different ways to perform multiplication.
+   Not all are implemented for all values of w.
+   See the paper for an explanation of how they work. */
+
+typedef enum {GF_MULT_DEFAULT,
+              GF_MULT_SHIFT,
+              GF_MULT_CARRY_FREE,
+              GF_MULT_CARRY_FREE_GK,
+              GF_MULT_GROUP,
+              GF_MULT_BYTWO_p,
+              GF_MULT_BYTWO_b,
+              GF_MULT_TABLE,
+              GF_MULT_LOG_TABLE,
+              GF_MULT_LOG_ZERO,
+              GF_MULT_LOG_ZERO_EXT,
+              GF_MULT_SPLIT_TABLE,
+              GF_MULT_COMPOSITE } gf_mult_type_t;
+
+/* These are the different ways to optimize region 
+   operations.  They are bits because you can compose them.
+   Certain optimizations only apply to certain gf_mult_type_t's.  
+   Again, please see documentation for how to use these */
+   
+#define GF_REGION_DEFAULT      (0x0)
+#define GF_REGION_DOUBLE_TABLE (0x1)
+#define GF_REGION_QUAD_TABLE   (0x2)
+#define GF_REGION_LAZY         (0x4)
+#define GF_REGION_SIMD         (0x8)
+#define GF_REGION_SSE          (0x8)
+#define GF_REGION_NOSIMD       (0x10)
+#define GF_REGION_NOSSE        (0x10)
+#define GF_REGION_ALTMAP       (0x20)
+#define GF_REGION_CAUCHY       (0x40)
+
+typedef uint32_t gf_region_type_t;
+
+/* These are different ways to implement division.
+   Once again, it's best to use "DEFAULT".  However,
+   there are times when you may want to experiment
+   with the others. */
+
+typedef enum { GF_DIVIDE_DEFAULT,
+               GF_DIVIDE_MATRIX,
+               GF_DIVIDE_EUCLID } gf_division_type_t;
+
+/* We support w=4,8,16,32,64 and 128 with their own data types and
+   operations for multiplication, division, etc.  We also support
+   a "gen" type so that you can do general gf arithmetic for any 
+   value of w from 1 to 32.  You can perform a "region" operation
+   on these if you use "CAUCHY" as the mapping. 
+ */
+
+typedef uint32_t    gf_val_32_t;
+typedef uint64_t    gf_val_64_t;
+typedef uint64_t   *gf_val_128_t;
+
+extern int _gf_errno;
+extern void gf_error();
+
+typedef struct gf *GFP;
+
+typedef union gf_func_a_b {
+    gf_val_32_t  (*w32) (GFP gf, gf_val_32_t a,  gf_val_32_t b);
+    gf_val_64_t  (*w64) (GFP gf, gf_val_64_t a,  gf_val_64_t b);
+    void         (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t c);
+} gf_func_a_b;
+  
+typedef union {
+  gf_val_32_t  (*w32) (GFP gf, gf_val_32_t a);
+  gf_val_64_t  (*w64) (GFP gf, gf_val_64_t a);
+  void         (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b);
+} gf_func_a;
+  
+typedef union {
+  void  (*w32) (GFP gf, void *src, void *dest, gf_val_32_t val,  int bytes, int add);
+  void  (*w64) (GFP gf, void *src, void *dest, gf_val_64_t val,  int bytes, int add);
+  void  (*w128)(GFP gf, void *src, void *dest, gf_val_128_t val, int bytes, int add);
+} gf_region;
+
+typedef union {
+  gf_val_32_t  (*w32) (GFP gf, void *start, int bytes, int index);
+  gf_val_64_t  (*w64) (GFP gf, void *start, int bytes, int index);
+  void         (*w128)(GFP gf, void *start, int bytes, int index, gf_val_128_t rv);
+} gf_extract;
+
+typedef struct gf {
+  gf_func_a_b    multiply;
+  gf_func_a_b    divide;
+  gf_func_a      inverse;
+  gf_region      multiply_region;
+  gf_extract     extract_word;
+  void           *scratch;
+} gf_t;
+    
+/* Initializes the GF to defaults.  Pass it a pointer to a gf_t.
+   Returns 0 on failure, 1 on success. */
+
+extern int gf_init_easy(GFP gf, int w);
+
+/* Initializes the GF changing the defaults.
+   Returns 0 on failure, 1 on success.
+   Pass it a pointer to a gf_t.
+   For mult_type and divide_type, use a value of gf_mult_type_t or gf_division_type_t, respectively.
+   For region_type, OR together the GF_REGION_xxx's defined above.  
+   Use 0 as prim_poly for defaults.  Otherwise, the leading 1 is optional.
+   Use NULL for scratch_memory to have init_hard allocate memory.  Otherwise,
+   use gf_scratch_size() to determine how big scratch_memory has to be.
+ */
+
+extern int gf_init_hard(GFP gf, 
+                        int w, 
+                        int mult_type, 
+                        int region_type, 
+                        int divide_type, 
+                        uint64_t prim_poly,
+                        int arg1, 
+                        int arg2,
+                        GFP base_gf,
+                        void *scratch_memory);
+
+/* Determines the size for scratch_memory.  
+   Returns 0 on failure and non-zero on success. */
+
+extern int gf_scratch_size(int w, 
+                           int mult_type, 
+                           int region_type, 
+                           int divide_type, 
+                           int arg1, 
+                           int arg2);
+
+/* This reports the scratch size of a gf_t that has already been created. */
+
+extern int gf_size(GFP gf);
+
+/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc.
+   If recursive = 1, then it calls itself recursively on base_gf. */
+
+extern int gf_free(GFP gf, int recursive);
+
+/* This is support for inline single multiplications and divisions.
+   I know it's yucky, but if you've got to be fast, you've got to be fast.
+   We support inlining for w=4, w=8 and w=16.  
+
+   To use inline multiplication and division with w=4 or 8, you should use the 
+   default gf_t, or one with a single table.  Otherwise, gf_w4/8_get_mult_table()
+   will return NULL. Similarly, with w=16, the gf_t must be LOG */
+
+uint8_t *gf_w4_get_mult_table(GFP gf);
+uint8_t *gf_w4_get_div_table(GFP gf);
+
+#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)])
+
+uint8_t *gf_w8_get_mult_table(GFP gf);
+uint8_t *gf_w8_get_div_table(GFP gf);
+
+#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)])
+
+uint16_t *gf_w16_get_log_table(GFP gf);
+uint16_t *gf_w16_get_mult_alog_table(GFP gf);
+uint16_t *gf_w16_get_div_alog_table(GFP gf);
+
+#define GF_W16_INLINE_MULT(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(uint32_t)log[a]+(uint32_t)log[b]])
+#define GF_W16_INLINE_DIV(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(int)log[a]-(int)log[b]])
+#endif
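
A minimal usage sketch of the public API declared above: initialize a field, do a single
multiplication and division, and try the inline w=8 table. Illustrative only, not part of
the imported sources; it uses only gf_init_easy, the w32 function pointers,
gf_w8_get_mult_table/GF_W8_INLINE_MULTDIV and gf_free as declared in this header.

    #include <stdio.h>
    #include "gf_complete.h"

    int main(void)
    {
      gf_t gf;
      uint8_t a = 0x53, b = 0xca, *mt;

      /* Initialize GF(2^8) with all defaults; returns 0 on failure. */
      if (!gf_init_easy(&gf, 8)) return 1;

      /* Single multiplications and divisions go through the w32 members. */
      printf("a*b = 0x%x\n", gf.multiply.w32(&gf, a, b));
      printf("a/b = 0x%x\n", gf.divide.w32(&gf, a, b));

      /* Inline multiplication: per the comment above, this only works for the
         default gf_t or one with a single table; otherwise the table is NULL. */
      mt = gf_w8_get_mult_table(&gf);
      if (mt != NULL)
        printf("inline a*b = 0x%x\n", GF_W8_INLINE_MULTDIV(mt, a, b));

      gf_free(&gf, 0);   /* frees the scratch that gf_init_easy malloc'd */
      return 0;
    }
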
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_general.h b/src/erasure-code/jerasure/gf-complete/include/gf_general.h
new file mode 100644
index 0000000..9a5de52
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_general.h
@@ -0,0 +1,61 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_general.h
+ *
+ * This file has helper routines for doing basic GF operations with any
+ * legal value of w.  The problem is that w <= 32, w=64 and w=128 all have
+ * different data types, which is a pain.  The procedures in this file try
+ * to alleviate that pain.  They are used in gf_unit and gf_time.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+
+typedef union {
+  uint32_t w32;
+  uint64_t w64;
+  uint64_t w128[2];
+} gf_general_t;
+
+void gf_general_set_zero(gf_general_t *v, int w);
+void gf_general_set_one(gf_general_t *v, int w);
+void gf_general_set_two(gf_general_t *v, int w);
+
+int gf_general_is_zero(gf_general_t *v, int w);
+int gf_general_is_one(gf_general_t *v, int w);
+int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w);
+
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex);
+int  gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex);
+
+void gf_general_set_random(gf_general_t *v, int w, int zero_ok);
+
+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
+void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
+void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
+void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b);
+
+void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, 
+                                   void *ra, void *rb, 
+                                   int bytes, int xor);
+
+void gf_general_do_region_check(gf_t *gf, gf_general_t *a, 
+                                void *orig_a, void *orig_target, void *final_target, 
+                                int bytes, int xor);
+
+
+/* "which" is 'M', 'D' or 'I', for multiply, divide or inverse. */
+
+void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size);
+int  gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char which);
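
A short sketch of how these helpers let test code treat every legal w uniformly, along the
lines of gf_unit/gf_time. Seeding through MOA_Seed() from gf_rand.h is an assumption about
where gf_general_set_random draws its randomness; everything else uses only the
declarations above.

    #include <stdio.h>
    #include <time.h>
    #include "gf_complete.h"
    #include "gf_general.h"
    #include "gf_rand.h"

    /* Multiply two random values in GF(2^w) and print them in hex. */
    static void demo(int w)
    {
      gf_t gf;
      gf_general_t a, b, c;
      char as[64], bs[64], cs[64];

      if (!gf_init_easy(&gf, w)) return;

      MOA_Seed((uint32_t) time(NULL));       /* assumed: seed the MOA generator */
      gf_general_set_random(&a, w, 1);       /* zero_ok = 1 */
      gf_general_set_random(&b, w, 1);
      gf_general_multiply(&gf, &a, &b, &c);  /* c = a * b */

      gf_general_val_to_s(&a, w, as, 1);     /* hex = 1 */
      gf_general_val_to_s(&b, w, bs, 1);
      gf_general_val_to_s(&c, w, cs, 1);
      printf("w=%3d: %s * %s = %s\n", w, as, bs, cs);

      gf_free(&gf, 0);
    }
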
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_int.h b/src/erasure-code/jerasure/gf-complete/include/gf_int.h
new file mode 100644
index 0000000..32866f4
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_int.h
@@ -0,0 +1,200 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_int.h
+ *
+ * Internal code for Galois field routines.  This is not meant for 
+ * users to include, but for the internal GF files to use. 
+ */
+
+#pragma once
+
+#include "gf_complete.h"
+
+#include <string.h>
+
+extern void     timer_start (double *t);
+extern double   timer_split (const double *t);
+extern void     galois_fill_random (void *buf, int len, unsigned int seed);
+
+typedef struct {
+  int mult_type;
+  int region_type;
+  int divide_type;
+  int w;
+  uint64_t prim_poly;
+  int free_me;
+  int arg1;
+  int arg2;
+  gf_t *base_gf;
+  void *private;
+} gf_internal_t;
+
+extern int gf_w4_init (gf_t *gf);
+extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w8_init (gf_t *gf);
+extern int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w16_init (gf_t *gf);
+extern int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w32_init (gf_t *gf);
+extern int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w64_init (gf_t *gf);
+extern int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w128_init (gf_t *gf);
+extern int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_wgen_init (gf_t *gf);
+extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor);
+gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index);
+
+extern void gf_alignment_error(char *s, int a);
+
+extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp);
+
+/* This returns the correct default for prim_poly when base is used as the base
+   field for COMPOSITE.  It returns 0 if we don't have a default prim_poly. */
+
+extern uint64_t gf_composite_get_default_poly(gf_t *base);
+
+/* This structure lets you define a region multiply.  It helps because you can handle
+   unaligned portions of the data with the procedures below, which really cleans
+   up the code. */
+
+typedef struct {
+  gf_t *gf;
+  void *src;
+  void *dest;
+  int bytes;
+  uint64_t val;
+  int xor;
+  int align;           /* The number of bytes to which to align. */
+  void *s_start;       /* The start and the top of the aligned region. */
+  void *d_start;
+  void *s_top;
+  void *d_top;
+} gf_region_data;
+
+/* This lets you set up one of these in one call. It also sets the start/top pointers. */
+
+void gf_set_region_data(gf_region_data *rd,
+                        gf_t *gf,
+                        void *src,
+                        void *dest,
+                        int bytes,
+                        uint64_t val,
+                        int xor,
+                        int align);
+
+/* This performs gf->multiply.32() on all of the unaligned bytes in the beginning of the region */
+
+extern void gf_do_initial_region_alignment(gf_region_data *rd);
+
+/* This performs gf->multiply.32() on all of the unaligned bytes in the end of the region */
+
+extern void gf_do_final_region_alignment(gf_region_data *rd);
+
+extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base);
+
+extern void gf_multby_zero(void *dest, int bytes, int xor);
+extern void gf_multby_one(void *src, void *dest, int bytes, int xor);
+
+typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
+              GF_E_MDEFREG, /* Reg != Default && Mult == Default */
+              GF_E_MDEFARG, /* Args != Default && Mult == Default */
+              GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
+              GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
+              GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
+              GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */
+              GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
+              GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/
+              GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
+              GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */
+              GF_E_MATRIXW, /* Div == MATRIX && w > 32 */
+              GF_E_BAD___W, /* Illegal w */
+              GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */
+              GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */
+              GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */
+              GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */
+              GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */
+              GF_E_QUAD__W, /* Reg == QUAD && w != 4 */
+              GF_E_QUAD__J, /* Reg == QUAD && other Reg */
+              GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/
+              GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
+              GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */
+              GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
+              GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */
+              GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
+              GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
+              GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
+              GF_E_LOGBADW, /* Mult == LOGx, w too big*/
+              GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */
+              GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */
+              GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */
+              GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */
+              GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */
+              GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */
+              GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */
+              GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */
+              GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */
+              GF_E_GR_AR_W, /* Mult == GROUP, either arg > w  */
+              GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
+              GF_E_TABLE_W, /* Mult == TABLE, w too big */
+              GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */
+              GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
+              GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
+              GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
+              GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */
+              GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */
+              GF_E_SP128_A, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */
+              GF_E_SP128_S, /* Mult == SPLIT, w=128, SSE only with 4/128 */
+              GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128)  */
+              GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */
+              GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */
+              GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */
+              GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */
+              GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */
+              GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */
+              GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */
+              GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */
+              GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */
+              GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */
+              GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */
+              GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */
+              GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
+              GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
+              GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
+              GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */
+              GF_E_COMP__W, /* Mult == COMP, Bad w. */
+              GF_E_UNKFLAG, /* Unknown flag in create_from.... */
+              GF_E_UNKNOWN, /* Unknown mult_type. */
+              GF_E_UNK_REG, /* Unknown region_type. */
+              GF_E_UNK_DIV, /* Unknown divide_type. */
+              GF_E_CFM___W, /* Mult == CFM,  Bad w. */
+              GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_FEWARGS, /* Too few args in argc/argv. */
+              GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */
+              GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */
+              GF_E_COMPXPP, /* Can't derive a default pp for composite field. */
+              GF_E_BASE__W, /* Composite -- Base field is the wrong size. */
+              GF_E_TWOMULT, /* In create_from... two -m's. */
+              GF_E_TWO_DIV, /* In create_from... two -d's. */
+              GF_E_POLYSPC, /* Bad number after -p. */
+              GF_E_SPLITAR, /* Ran out of arguments in SPLIT */
+              GF_E_SPLITNU, /* Arguments not integers in SPLIT. */
+              GF_E_GROUPAR, /* Ran out of arguments in GROUP */
+              GF_E_GROUPNU, /* Arguments not integers in GROUP. */
+              GF_E_DEFAULT } gf_error_type_t;
+
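
These codes surface to callers through _gf_errno and gf_error(), both declared in
gf_complete.h; gf_error() (defined in gf.c, further down in this patch) prints the matching
message to stderr. A small sketch, using a combination that the checks in gf.c reject with
GF_E_ALTSHIF:

    #include "gf_complete.h"

    int try_init(void)
    {
      gf_t gf;

      /* -m SHIFT cannot be combined with -r ALTMAP, so this init fails. */
      if (!gf_init_hard(&gf, 16, GF_MULT_SHIFT, GF_REGION_ALTMAP,
                        GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL)) {
        gf_error();        /* prints "Cannot specify -m SHIFT and -r ALTMAP." */
        return 0;
      }
      gf_free(&gf, 0);
      return 1;
    }
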
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_method.h b/src/erasure-code/jerasure/gf-complete/include/gf_method.h
new file mode 100644
index 0000000..880b349
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_method.h
@@ -0,0 +1,20 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_method.h
+ *
+ * Parses argv to figure out the flags and arguments.  Creates the gf.
+ */
+
+#pragma once
+
+#include "gf_complete.h"
+
+/* Parses argv starting at "starting".  
+   
+   Returns 0 on failure.
+   On success, it returns one past the last argument it read in argv. */
+
+extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);
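
A hypothetical driver showing the intended shape of a call. The -m/-r/-d/-p flag spelling,
the trailing "-" terminator, and the "-m SPLIT 4 16" arguments are assumptions inferred from
the error strings in gf.c; only the function signature itself comes from this header.

    #include <stdio.h>
    #include "gf_complete.h"
    #include "gf_method.h"

    int main(void)
    {
      gf_t gf;
      char *method[] = { "-m", "SPLIT", "4", "16", "-" };
      int consumed;

      consumed = create_gf_from_argv(&gf, 16, 5, method, 0);
      if (consumed == 0) {              /* 0 means parsing or init failed */
        gf_error();
        return 1;
      }
      printf("consumed %d args; 7 * 9 = %u\n",
             consumed, gf.multiply.w32(&gf, 7, 9));
      gf_free(&gf, 0);
      return 0;
    }
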
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_rand.h b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h
new file mode 100644
index 0000000..24294ad
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h
@@ -0,0 +1,22 @@
+/* 
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_rand.h
+ *
+ * Random number generation, using the "Mother of All" random number generator.  */
+
+#pragma once
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* These are all pretty self-explanatory */
+uint32_t MOA_Random_32();
+uint64_t MOA_Random_64();
+void     MOA_Random_128(uint64_t *x);
+uint32_t MOA_Random_W(int w, int zero_ok);
+void MOA_Fill_Random_Region (void *reg, int size);   /* reg should be aligned to 4 bytes, but
+                                                        size can be anything. */
+void     MOA_Seed(uint32_t seed);
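
A quick sketch of the generator in use. Whether MOA_Fill_Random_Region's size is counted in
bytes is an assumption; the declaration above only says the region must be 4-byte aligned.

    #include <stdio.h>
    #include "gf_rand.h"

    int main(void)
    {
      uint32_t buf[256];                           /* 4-byte aligned */

      MOA_Seed(0xdeadbeef);                        /* deterministic run */
      printf("32-bit: %08x\n", MOA_Random_32());
      printf("w=13:   %x\n", MOA_Random_W(13, 1)); /* zero_ok = 1 */
      MOA_Fill_Random_Region(buf, sizeof(buf));    /* size assumed in bytes */
      return 0;
    }
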
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w16.h b/src/erasure-code/jerasure/gf-complete/include/gf_w16.h
new file mode 100644
index 0000000..fb4c0e9
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w16.h
@@ -0,0 +1,66 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w16.h
+ *
+ * Defines and data structures for 16-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W16_H
+#define GF_COMPLETE_GF_W16_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (16)
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
+
+#define GF_BASE_FIELD_WIDTH (8)
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
+
+struct gf_w16_logtable_data {
+    uint16_t      log_tbl[GF_FIELD_SIZE];
+    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+    uint16_t      *d_antilog;
+};
+
+struct gf_w16_zero_logtable_data {
+    int           log_tbl[GF_FIELD_SIZE];
+    uint16_t      _antilog_tbl[GF_FIELD_SIZE * 4];
+    uint16_t      *antilog_tbl;
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w16_lazytable_data {
+    uint16_t      log_tbl[GF_FIELD_SIZE];
+    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+    uint16_t      *d_antilog;
+    uint16_t      lazytable[GF_FIELD_SIZE];
+};
+
+struct gf_w16_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+struct gf_w16_split_8_8_data {
+    uint16_t      tables[3][256][256];
+};
+
+struct gf_w16_group_4_4_data {
+    uint16_t reduce[16];
+    uint16_t shift[16];
+};
+
+struct gf_w16_composite_data {
+  uint8_t *mult_table;
+};
+
+void gf_w16_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W16_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w32.h b/src/erasure-code/jerasure/gf-complete/include/gf_w32.h
new file mode 100644
index 0000000..3396402
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w32.h
@@ -0,0 +1,71 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w32.h
+ *
+ * Defines and data structures for 32-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W32_H
+#define GF_COMPLETE_GF_W32_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (32)
+#define GF_FIRST_BIT (1 << 31)
+
+#define GF_BASE_FIELD_WIDTH (16)
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
+
+struct gf_split_2_32_lazy_data {
+    uint32_t      tables[16][4];
+    uint32_t      last_value;
+};
+
+struct gf_w32_split_8_8_data {
+    uint32_t      tables[7][256][256];
+    uint32_t      region_tables[4][256];
+    uint32_t      last_value;
+};
+
+struct gf_w32_group_data {
+    uint32_t *reduce;
+    uint32_t *shift;
+    int      tshift;
+    uint64_t rmask;
+    uint32_t *memory;
+};
+
+struct gf_split_16_32_lazy_data {
+    uint32_t      tables[2][(1<<16)];
+    uint32_t      last_value;
+};
+
+struct gf_split_8_32_lazy_data {
+    uint32_t      tables[4][256];
+    uint32_t      last_value;
+};
+
+struct gf_split_4_32_lazy_data {
+    uint32_t      tables[8][16];
+    uint32_t      last_value;
+};
+
+struct gf_w32_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+struct gf_w32_composite_data {
+  uint16_t *log;
+  uint16_t *alog;
+};
+
+void gf_w32_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W32_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w4.h b/src/erasure-code/jerasure/gf-complete/include/gf_w4.h
new file mode 100644
index 0000000..8ee94a3
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w4.h
@@ -0,0 +1,63 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w4.h
+ *
+ * Defines and data structures for 4-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W4_H
+#define GF_COMPLETE_GF_W4_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH      4
+#define GF_DOUBLE_WIDTH     (GF_FIELD_WIDTH*2)
+#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE       (GF_FIELD_SIZE-1)
+
+/* ------------------------------------------------------------
+   JSP: Each implementation has its own data, which is allocated
+   at one time as part of the handle. For that reason, it
+   shouldn't be hierarchical -- i.e. one should be able to
+   allocate it with one call to malloc. */
+
+struct gf_logtable_data {
+    uint8_t      log_tbl[GF_FIELD_SIZE];
+    uint8_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint8_t      *antilog_tbl_div;
+};
+
+struct gf_single_table_data {
+    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_double_table_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+struct gf_quad_table_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t     mult[GF_FIELD_SIZE][(1<<16)];
+};
+
+struct gf_quad_table_lazy_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t     mult[(1 << 16)];
+};
+
+struct gf_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+// ARM NEON init functions
+int gf_w4_neon_cfm_init(gf_t *gf);
+void gf_w4_neon_single_table_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W4_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w64.h b/src/erasure-code/jerasure/gf-complete/include/gf_w64.h
new file mode 100644
index 0000000..9a74a81
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w64.h
@@ -0,0 +1,50 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w64.h
+ *
+ * Defines and data structures for 64-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W64_H
+#define GF_COMPLETE_GF_W64_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (64)
+#define GF_FIRST_BIT (1ULL << 63)
+
+#define GF_BASE_FIELD_WIDTH (32)
+#define GF_BASE_FIELD_SIZE       (1ULL << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1
+
+struct gf_w64_group_data {
+    uint64_t *reduce;
+    uint64_t *shift;
+    uint64_t *memory;
+};
+
+struct gf_split_4_64_lazy_data {
+    uint64_t      tables[16][16];
+    uint64_t      last_value;
+};
+
+struct gf_split_8_64_lazy_data {
+    uint64_t      tables[8][(1<<8)];
+    uint64_t      last_value;
+};
+
+struct gf_split_16_64_lazy_data {
+    uint64_t      tables[4][(1<<16)];
+    uint64_t      last_value;
+};
+
+struct gf_split_8_8_data {
+    uint64_t      tables[15][256][256];
+};
+
+void gf_w64_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W64_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w8.h b/src/erasure-code/jerasure/gf-complete/include/gf_w8.h
new file mode 100644
index 0000000..938fcfd
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w8.h
@@ -0,0 +1,99 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w8.h
+ *
+ * Defines and data structures for 8-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W8_H
+#define GF_COMPLETE_GF_W8_H
+
+#include "gf_int.h"
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (8)
+#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
+#define GF_HALF_SIZE       (1 << (GF_FIELD_WIDTH/2))
+#define GF_MULT_GROUP_SIZE       GF_FIELD_SIZE-1
+
+#define GF_BASE_FIELD_WIDTH (4)
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
+
+struct gf_w8_logtable_data {
+    uint8_t         log_tbl[GF_FIELD_SIZE];
+    uint8_t         antilog_tbl[GF_FIELD_SIZE * 2];
+    uint8_t         inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w8_logzero_table_data {
+    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
+    uint8_t         antilog_tbl[512+512+1];
+    uint8_t         *div_tbl;
+    uint8_t         *inv_tbl;
+};
+
+struct gf_w8_logzero_small_table_data {
+    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
+    uint8_t         antilog_tbl[255*3];
+    uint8_t         inv_tbl[GF_FIELD_SIZE];
+    uint8_t         *div_tbl;
+};
+
+struct gf_w8_composite_data {
+  uint8_t *mult_table;
+};
+
+/* Don't change the order of these relative to gf_w8_half_table_data */
+
+struct gf_w8_default_data {
+  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_half_table_data {
+  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
+};
+
+struct gf_w8_single_table_data {
+  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_data {
+    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t        mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_lazy_data {
+    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t         smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t        mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w4_logtable_data {
+    uint8_t         log_tbl[GF_BASE_FIELD_SIZE];
+    uint8_t         antilog_tbl[GF_BASE_FIELD_SIZE * 2];
+    uint8_t         *antilog_tbl_div;
+};
+
+struct gf_w4_single_table_data {
+    uint8_t         div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+    uint8_t         mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+};
+
+struct gf_w8_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+int gf_w8_neon_cfm_init(gf_t *gf);
+void gf_w8_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W8_H */
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf.c b/src/erasure-code/jerasure/gf-complete/src/gf.c
new file mode 100644
index 0000000..835fb12
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf.c
@@ -0,0 +1,1076 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf.c
+ *
+ * Generic routines for Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int _gf_errno = GF_E_DEFAULT;
+
+void gf_error()
+{
+  char *s;
+
+  switch(_gf_errno) {
+    case GF_E_DEFAULT: s = "No Error."; break;
+    case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
+    case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
+    case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
+    case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
+    case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
+    case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
+    case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
+    case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
+    case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
+    case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
+    case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
+    case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
+    case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
+    case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
+    case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
+    case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
+    case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
+    case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
+    case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
+    case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
+    case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
+    case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
+    case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
+    case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
+    case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
+    case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
+    case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
+    case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
+    case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
+    case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
+    case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
+    case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
+    case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
+    case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
+    case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
+    case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
+    case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
+    case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
+    case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
+    case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
+    case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
+    case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
+    case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
+    case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
+    case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
+    case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
+    case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
+    case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
+    case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break;
+    case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
+    case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
+    case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
+    case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
+    case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
+    case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
+    case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
+    case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
+    case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
+    case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
+    case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
+    case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
+    case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
+    case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
+    case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
+    case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
+    case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
+    case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
+    case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
+    case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
+    case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
+    case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
+    case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
+    case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
+    case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
+    case GF_E_UNK_REG: s = "Unknown region type."; break;
+    case GF_E_UNK_DIV: s = "Unknown division type."; break;
+    default: s = "Undefined error.";
+  }
+
+  fprintf(stderr, "%s\n", s);
+}
+
+uint64_t gf_composite_get_default_poly(gf_t *base) 
+{
+  gf_internal_t *h;
+  uint64_t rv;
+
+  h = (gf_internal_t *) base->scratch;
+  if (h->w == 4) {
+    if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+    if (h->prim_poly == 0x13) return 2;
+    return 0;
+  } 
+  if (h->w == 8) {
+    if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+    if (h->prim_poly == 0x11d) return 3;
+    return 0;
+  }
+  if (h->w == 16) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      rv = gf_composite_get_default_poly(h->base_gf);
+      if (rv != h->prim_poly) return 0;
+      if (rv == 3) return 0x105;
+      return 0;
+    } else {
+      if (h->prim_poly == 0x1100b) return 2;
+      if (h->prim_poly == 0x1002d) return 7;
+      return 0;
+    }
+  }
+  if (h->w == 32) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      rv = gf_composite_get_default_poly(h->base_gf);
+      if (rv != h->prim_poly) return 0;
+      if (rv == 2) return 0x10005;
+      if (rv == 7) return 0x10008;
+      if (rv == 0x105) return 0x10002;
+      return 0;
+    } else {
+      if (h->prim_poly == 0x400007) return 2;
+      if (h->prim_poly == 0xc5) return 3;
+      return 0;
+    }
+  }
+  if (h->w == 64) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      rv = gf_composite_get_default_poly(h->base_gf);
+      if (rv != h->prim_poly) return 0;
+      if (rv == 3) return 0x100000009ULL;
+      if (rv == 2) return 0x100000004ULL;
+      if (rv == 0x10005) return 0x100000003ULL;
+      if (rv == 0x10002) return 0x100000005ULL;
+      if (rv == 0x10008) return 0x100000006ULL;  /* JSP: (0x100000003 works too,
+                                                    but I want to differentiate cases). */
+      return 0;
+    } else {
+      if (h->prim_poly == 0x1bULL) return 2;
+      return 0;
+    }
+  }
+  return 0;
+}
+
+int gf_error_check(int w, int mult_type, int region_type, int divide_type,
+                   int arg1, int arg2, uint64_t poly, gf_t *base)
+{
+  int sse3 = 0;
+  int sse2 = 0;
+  int pclmul = 0;
+  int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
+  gf_internal_t *sub;
+
+  rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
+  rquad   = (region_type & GF_REGION_QUAD_TABLE);
+  rlazy   = (region_type & GF_REGION_LAZY);
+  rsimd   = (region_type & GF_REGION_SIMD);
+  rnosimd = (region_type & GF_REGION_NOSIMD);
+  raltmap = (region_type & GF_REGION_ALTMAP);
+  rcauchy = (region_type & GF_REGION_CAUCHY);
+
+  if (divide_type != GF_DIVIDE_DEFAULT &&
+      divide_type != GF_DIVIDE_MATRIX && 
+      divide_type != GF_DIVIDE_EUCLID) {
+    _gf_errno = GF_E_UNK_DIV;
+    return 0;
+  }
+
+  tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
+          GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
+          GF_REGION_CAUCHY );
+  if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
+
+#ifdef INTEL_SSE2
+  sse2 = 1;
+#endif
+
+#ifdef INTEL_SSSE3
+  sse3 = 1;
+#endif
+
+#ifdef INTEL_SSE4_PCLMUL
+  pclmul = 1;
+#endif
+
+#ifdef ARM_NEON
+  pclmul = 1;
+  sse3 = 1;
+#endif
+
+
+  if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
+    
+  if (mult_type != GF_MULT_COMPOSITE && w < 64) {
+    if ((poly >> (w+1)) != 0)                   { _gf_errno = GF_E_BADPOLY; return 0; }
+  }
+
+  if (mult_type == GF_MULT_DEFAULT) {
+    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
+    if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
+    if (arg1 != 0 || arg2 != 0)           { _gf_errno = GF_E_MDEFARG; return 0; }
+    return 1;
+  }
+  
+  if (rsimd && rnosimd)                              { _gf_errno = GF_E_SIMD_NO; return 0; }
+  if (rcauchy && w > 32)                             { _gf_errno = GF_E_CAUGT32; return 0; }
+  if (rcauchy && region_type != GF_REGION_CAUCHY)    { _gf_errno = GF_E_CAUCHYB; return 0; }
+  if (rcauchy && mult_type == GF_MULT_COMPOSITE)     { _gf_errno = GF_E_CAUCOMP; return 0; }
+
+  if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && 
+      mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+    _gf_errno = GF_E_ARG1SET;
+    return 0;
+  }
+
+  if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+    _gf_errno = GF_E_ARG2SET;
+    return 0;
+  }
+
+  if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }
+
+  if (rdouble) {
+    if (rquad)                      { _gf_errno = GF_E_DOUQUAD; return 0; }
+    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
+    if (w != 4 && w != 8)           { _gf_errno = GF_E_DOUBLEW; return 0; }
+    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
+    if (rlazy && w == 4)            { _gf_errno = GF_E_DOUBLEL; return 0; }
+    return 1;
+  }
+
+  if (rquad) {
+    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
+    if (w != 4)                     { _gf_errno = GF_E_QUAD__W; return 0; }
+    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
+    return 1;
+  }
+
+  if (rlazy)                        { _gf_errno = GF_E_LAZY__X; return 0; }
+
+  if (mult_type == GF_MULT_SHIFT) {
+    if (raltmap)                    { _gf_errno = GF_E_ALTSHIF; return 0; }
+    if (rsimd || rnosimd)           { _gf_errno = GF_E_SSESHIF; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE) {
+    if (w != 4 && w != 8 && w != 16 &&
+        w != 32 && w != 64 && w != 128)            { _gf_errno = GF_E_CFM___W; return 0; }
+    if (w == 4 && (poly & 0xc))                    { _gf_errno = GF_E_CFM4POL; return 0; }
+    if (w == 8 && (poly & 0x80))                   { _gf_errno = GF_E_CFM8POL; return 0; }
+    if (w == 16 && (poly & 0xe000))                { _gf_errno = GF_E_CF16POL; return 0; }
+    if (w == 32 && (poly & 0xfe000000))            { _gf_errno = GF_E_CF32POL; return 0; }
+    if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
+    if (raltmap)                                   { _gf_errno = GF_E_ALT_CFM; return 0; }
+    if (rsimd || rnosimd)                          { _gf_errno = GF_E_SSE_CFM; return 0; }
+    if (!pclmul)                                   { _gf_errno = GF_E_PCLMULX; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE_GK) {
+    if (w != 4 && w != 8 && w != 16 &&
+        w != 32 && w != 64 && w != 128)            { _gf_errno = GF_E_CFM___W; return 0; }
+    if (raltmap)                                   { _gf_errno = GF_E_ALT_CFM; return 0; }
+    if (rsimd || rnosimd)                          { _gf_errno = GF_E_SSE_CFM; return 0; }
+    if (!pclmul)                                   { _gf_errno = GF_E_PCLMULX; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+    if (raltmap)                    { _gf_errno = GF_E_ALT_BY2; return 0; }
+    if (rsimd && !sse2)              { _gf_errno = GF_E_BY2_SSE; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
+                                     || mult_type == GF_MULT_LOG_ZERO_EXT ) {
+    if (w > 27)                     { _gf_errno = GF_E_LOGBADW; return 0; }
+    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
+
+    if (mult_type == GF_MULT_LOG_TABLE) return 1;
+
+    if (w != 8 && w != 16)          { _gf_errno = GF_E_ZERBADW; return 0; }
+
+    if (mult_type == GF_MULT_LOG_ZERO) return 1;
+
+    if (w != 8)                     { _gf_errno = GF_E_ZEXBADW; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_GROUP) {
+    if (arg1 <= 0 || arg2 <= 0)                 { _gf_errno = GF_E_GR_ARGX; return 0; }
+    if (w == 4 || w == 8)                       { _gf_errno = GF_E_GR_W_48; return 0; }
+    if (w == 16 && (arg1 != 4 || arg2 != 4))     { _gf_errno = GF_E_GR_W_16; return 0; }
+    if (w == 128 && (arg1 != 4 || 
+       (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
+    if (arg1 > 27 || arg2 > 27)                 { _gf_errno = GF_E_GR_A_27; return 0; }
+    if (arg1 > w || arg2 > w)                   { _gf_errno = GF_E_GR_AR_W; return 0; }
+    if (raltmap || rsimd || rnosimd)            { _gf_errno = GF_E_GR____J; return 0; }
+    return 1;
+  }
+  
+  if (mult_type == GF_MULT_TABLE) {
+    if (w != 16 && w >= 15)                     { _gf_errno = GF_E_TABLE_W; return 0; }
+    if (w != 4 && (rsimd || rnosimd))           { _gf_errno = GF_E_TAB_SSE; return 0; }
+    if (rsimd && !sse3)                         { _gf_errno = GF_E_TABSSE3; return 0; }
+    if (raltmap)                                { _gf_errno = GF_E_TAB_ALT; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_SPLIT_TABLE) {
+    if (arg1 > arg2) {
+      tmp = arg1;
+      arg1 = arg2;
+      arg2 = tmp;
+    }
+    if (w == 8) {
+      if (arg1 != 4 || arg2 != 8)               { _gf_errno = GF_E_SP_8_AR; return 0; }
+      if (rsimd && !sse3)                       { _gf_errno = GF_E_SP_SSE3; return 0; }
+      if (raltmap)                              { _gf_errno = GF_E_SP_8__A; return 0; }
+    } else if (w == 16) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 16)) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_16_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_16_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 16) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_16AR; return 0; }
+    } else if (w == 32) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 32) ||
+          (arg1 == 16 && arg2 == 32)) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_32_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_32_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 32) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_32AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP_32AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_32AR; return 0; }
+    } else if (w == 64) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 64) ||
+          (arg1 == 16 && arg2 == 64)) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_64_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_64_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 64) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_64AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP_64AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_64AR; return 0; }
+    } else if (w == 128) {
+      if (arg1 == 8 && arg2 == 128) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP128_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP128_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 128) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP128AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP128AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP128AR; return 0; }
+    } else                                      { _gf_errno = GF_E_SPLIT_W; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_COMPOSITE) {
+    if (w != 8 && w != 16 && w != 32 
+               && w != 64 && w != 128)          { _gf_errno = GF_E_COMP__W; return 0; }
+    if (w < 128 && (poly >> (w/2)) != 0)                   { _gf_errno = GF_E_COMP_PP; return 0; }
+    if (divide_type != GF_DIVIDE_DEFAULT)       { _gf_errno = GF_E_DIVCOMP; return 0; }
+    if (arg1 != 2)                              { _gf_errno = GF_E_COMP_A2; return 0; }
+    if (rsimd || rnosimd)                       { _gf_errno = GF_E_COMP_SS; return 0; }
+    if (base != NULL) {
+      sub = (gf_internal_t *) base->scratch;
+      if (sub->w != w/2)                      { _gf_errno = GF_E_BASE__W; return 0; }
+      if (poly == 0) {
+        if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
+      }
+    }
+    return 1;
+  }
+
+  _gf_errno = GF_E_UNKNOWN; 
+  return 0;
+}
+
+int gf_scratch_size(int w, 
+                    int mult_type, 
+                    int region_type, 
+                    int divide_type, 
+                    int arg1, 
+                    int arg2)
+{
+  if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;
+
+  switch(w) {
+    case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+    case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+    case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+    case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+    case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+    case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+    default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+  }
+}
+
+extern int gf_size(gf_t *gf)
+{
+  gf_internal_t *h;
+  int s;
+
+  s = sizeof(gf_t);
+  h = (gf_internal_t *) gf->scratch;
+  s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
+  if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);
+  return s;
+}
+
+
+int gf_init_easy(gf_t *gf, int w)
+{
+  return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 
+                      0, 0, 0, NULL, NULL);
+}
+
+/* Allen: What's going on here is that this function puts info into the
+       scratch memory of gf, and then calls the relevant REAL init
+       function for the word size.  It's probably done this way to consolidate
+       those aspects of initialization that don't rely on word size,
+       and then take care of the word-size-specific stuff afterwards. */
+
+int gf_init_hard(gf_t *gf, int w, int mult_type, 
+                        int region_type,
+                        int divide_type,
+                        uint64_t prim_poly,
+                        int arg1, int arg2,
+                        gf_t *base_gf,
+                        void *scratch_memory) 
+{
+  int sz;
+  gf_internal_t *h;
+ 
+  if (gf_error_check(w, mult_type, region_type, divide_type, 
+                     arg1, arg2, prim_poly, base_gf) == 0) return 0;
+
+  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+  if (sz <= 0) return 0;  /* This shouldn't happen, as all errors should get caught
+                             in gf_error_check() */
+  
+  if (scratch_memory == NULL) {
+    h = (gf_internal_t *) malloc(sz);
+    h->free_me = 1;
+  } else {
+    h = scratch_memory;
+    h->free_me = 0;
+  }
+  gf->scratch = (void *) h;
+  h->mult_type = mult_type;
+  h->region_type = region_type;
+  h->divide_type = divide_type;
+  h->w = w;
+  h->prim_poly = prim_poly;
+  h->arg1 = arg1;
+  h->arg2 = arg2;
+  h->base_gf = base_gf;
+  h->private = (void *) gf->scratch;
+  h->private = (uint8_t *)h->private + (sizeof(gf_internal_t));
+  gf->extract_word.w32 = NULL;
+
+  switch(w) {
+    case 4: return gf_w4_init(gf);
+    case 8: return gf_w8_init(gf);
+    case 16: return gf_w16_init(gf);
+    case 32: return gf_w32_init(gf);
+    case 64: return gf_w64_init(gf);
+    case 128: return gf_w128_init(gf);
+    default: return gf_wgen_init(gf);
+  }
+}
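+
+/* A small usage sketch (compiled out, for illustration only): create a field
+   with all defaults, multiply two words, and clean up.  The hypothetical
+   example_init_and_multiply() is not part of the library. */
+#if 0
+static void example_init_and_multiply(void)
+{
+  gf_t gf;
+  uint32_t p;
+
+  if (gf_init_easy(&gf, 16) == 0) return;    /* w = 16, default method */
+  p = gf.multiply.w32(&gf, 0x1234, 0x5678);  /* one single multiplication */
+  printf("product = %x\n", p);
+  gf_free(&gf, 0);                           /* 0: no base field to free */
+}
+#endif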
+
+int gf_free(gf_t *gf, int recursive)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (recursive && h->base_gf != NULL) {
+    gf_free(h->base_gf, 1);
+    free(h->base_gf);
+  }
+  if (h->free_me) free(h);
+  return 0; /* Making compiler happy */
+}
+
+void gf_alignment_error(char *s, int a)
+{
+  fprintf(stderr, "Alignment error in %s:\n", s);
+  fprintf(stderr, "   The source and destination buffers must be aligned to each other,\n");
+  fprintf(stderr, "   and they must be aligned to a %d-byte address.\n", a);
+  assert(0);
+}
+
+static 
+void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
+  int cols, i, j;
+  uint32_t tmp;
+
+  cols = rows;
+
+  for (i = 0; i < rows; i++) inv[i] = (1 << i);
+
+  /* First -- convert into upper triangular */
+
+  for (i = 0; i < cols; i++) {
+
+    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
+       matrix was not invertible */
+
+    if ((mat[i] & (1 << i)) == 0) {
+      for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
+      if (j == rows) {
+        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
+        assert(0);
+      }
+      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
+      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
+    }
+
+    /* Now for each j>i, add A_ji*Ai to Aj */
+    for (j = i+1; j != rows; j++) {
+      if ((mat[j] & (1 << i)) != 0) {
+        mat[j] ^= mat[i];
+        inv[j] ^= inv[i];
+      }
+    }
+  }
+
+  /* Now the matrix is upper triangular.  Back-substitute from the last row
+     up, clearing the entries above the diagonal */
+
+  for (i = rows-1; i >= 0; i--) {
+    for (j = 0; j < i; j++) {
+      if (mat[j] & (1 << i)) {
+        /*  mat[j] ^= mat[i]; */
+        inv[j] ^= inv[i];
+      }
+    }
+  }
+}
+
+uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) 
+{
+  uint32_t mat[32], inv[32], mask;
+  int i;
+
+  mask = (w == 32) ? 0xffffffff : ((uint32_t)1 << w) - 1;
+  for (i = 0; i < w; i++) {
+    mat[i] = y;
+
+    if (y & (1 << (w-1))) {
+      y = y << 1;
+      y = ((y ^ pp) & mask);
+    } else {
+      y = y << 1;
+    }
+  }
+
+  gf_invert_binary_matrix(mat, inv, w);
+  return inv[0];
+}
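+
+/* A small sketch (compiled out, for illustration): gf_bitmatrix_inverse()
+   returns the multiplicative inverse of y in GF(2^w) under the primitive
+   polynomial pp.  For example, in GF(2^4) with pp = 0x13, the inverse of 2
+   is 9, because 2 * 9 = x^4 + x = (x + 1) + x = 1 (mod x^4 + x + 1). */
+#if 0
+static void example_bitmatrix_inverse(void)
+{
+  uint32_t inv = gf_bitmatrix_inverse(2, 4, 0x13);
+  assert(inv == 9);
+}
+#endif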
+
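+/* Multiply a region 64 bits at a time from a caller-supplied table of 16-bit
+   products ("base" is assumed to hold val * x for every 16-bit x): each
+   64-bit word is handled as four table lookups that are shifted into place
+   and xor'd together. */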
+void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
+{
+  uint64_t a, prod;
+  int xor;
+  uint64_t *s64, *d64, *top;
+
+  s64 = rd->s_start;
+  d64 = rd->d_start;
+  top = rd->d_top;
+  xor = rd->xor;
+  
+  if (xor) {
+    while (d64 != top) {
+      a = *s64;
+      prod = base[a >> 48];
+      a <<= 16;
+      prod <<= 16;
+      prod ^= base[a >> 48];
+      a <<= 16;
+      prod <<= 16;
+      prod ^= base[a >> 48];
+      a <<= 16;
+      prod <<= 16;
+      prod ^= base[a >> 48];
+      prod ^= *d64;
+      *d64 = prod;
+      s64++;
+      d64++;
+    }
+  } else {
+    while (d64 != top) {
+      a = *s64;
+      prod = base[a >> 48];
+      a <<= 16;
+      prod <<= 16;
+      prod ^= base[a >> 48];
+      a <<= 16;
+      prod <<= 16;
+      prod ^= base[a >> 48];
+      a <<= 16;
+      prod <<= 16;
+      prod ^= base[a >> 48];
+      *d64 = prod;
+      s64++;
+      d64++;
+    }
+  }
+}
+
+static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
+{
+  uint8_t *s8, *d8;
+  uint16_t *s16, *d16;
+  uint32_t *s32, *d32;
+  uint64_t *s64, *d64;
+  gf_internal_t *h;
+  int wb;
+  uint32_t p, a;
+
+  h = rd->gf->scratch;
+  wb = (h->w)/8;
+  if (wb == 0) wb = 1;
+  
+  while (src < s_top) {
+    switch (h->w) {
+    case 8:
+      s8 = (uint8_t *) src;
+      d8 = (uint8_t *) dest;
+      *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) : 
+                      rd->gf->multiply.w32(rd->gf, rd->val, *s8);
+      break;
+    case 4:
+      s8 = (uint8_t *) src;
+      d8 = (uint8_t *) dest;
+      a = *s8;
+      p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
+      p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
+      if (rd->xor) p ^= *d8;
+      *d8 = p;
+      break;
+    case 16:
+      s16 = (uint16_t *) src;
+      d16 = (uint16_t *) dest;
+      *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) : 
+                      rd->gf->multiply.w32(rd->gf, rd->val, *s16);
+      break;
+    case 32:
+      s32 = (uint32_t *) src;
+      d32 = (uint32_t *) dest;
+      *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) : 
+                      rd->gf->multiply.w32(rd->gf, rd->val, *s32);
+      break;
+    case 64:
+      s64 = (uint64_t *) src;
+      d64 = (uint64_t *) dest;
+      *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) : 
+                      rd->gf->multiply.w64(rd->gf, rd->val, *s64);
+      break;
+    default:
+      fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
+      exit(1);
+    }
+    src = (uint8_t *)src + wb;
+    dest = (uint8_t *)dest + wb;
+  }
+}
+
+/* JSP - The purpose of this procedure is to error check alignment,
+   and to set up the region operation so that it can best leverage
+   large words.
+
+   It stores its information in rd.
+
+   Assuming you're not doing Cauchy coding (see below for that),
+   w will be 4, 8, 16, 32 or 64.  It can't be 128 (probably
+   should change that).
+
+   src and dest must then be aligned on ceil(w/8)-byte boundaries.
+   Moreover, bytes must be a multiple of ceil(w/8).  If the variable
+   align is equal to ceil(w/8), then we will set s_start = src,
+   d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
+   And we return -- the implementation will go ahead and do the
+   multiplication on individual words (e.g. using discrete logs).
+
+   If align is greater than ceil(w/8), then the implementation needs
+   to work on groups of "align" bytes.  For example, suppose you are
+   implementing BYTWO, without SSE. Then you will be doing the region
+   multiplication in units of 8 bytes, so align = 8. Or, suppose you
+   are doing a Quad table in GF(2^4). You will be doing the region
+   multiplication in units of 2 bytes, so align = 2. Or, suppose you
+   are doing split multiplication with SSE operations in GF(2^8).
+   Then align = 16. Worse yet, suppose you are doing split
+   multiplication with SSE operations in GF(2^16), with or without
+   ALTMAP. Then, you will be doing the multiplication on 256 bits at
+   a time.  So align = 32.
+
+   When align does not equal ceil(w/8), we split the region
+   multiplication into three parts.  We are going to make s_start be
+   the first address greater than or equal to src that is a multiple
+   of align.  s_top is going to be the largest address >= src+bytes
+   such that (s_top - s_start) is a multiple of align.  We do the
+   same with d_start and d_top.  When we say that "src and dest must
+   be aligned with respect to each other," we mean that s_start-src
+   must equal d_start-dest.
+
+   Now, the region multiplication is done in three parts -- the part
+   between src and s_start must be done using single words.
+   Similarly, the part between s_top and src+bytes must also be done
+   using single words.  The part between s_start and s_top will be
+   done in chunks of "align" bytes.
+
+   One final thing -- if align > 16, then s_start and d_start will be
+   aligned on a 16 byte boundary.  Perhaps we should have two
+   variables: align and chunksize.  Then we'd have s_start & d_start
+   aligned to "align", and have s_top-s_start be a multiple of
+   chunksize.  That may be less confusing, but it would be a big
+   change.
+
+   Finally, if align = -1, then we are doing Cauchy multiplication,
+   using only XOR's.  In this case, we're not going to care about
+   alignment because we are just doing XOR's.  Instead, the only
+   thing we care about is that bytes must be a multiple of w.
+
+   This is not to say that alignment doesn't matter in performance
+   with XOR's.  See that discussion in gf_multby_one().
+
+   After you call gf_set_region_data(), the procedure
+   gf_do_initial_region_alignment() calls gf->multiply.w32() on
+   everything between src and s_start.  The procedure
+   gf_do_final_region_alignment() calls gf->multiply.w32() on
+   everything between s_top and src+bytes.
+   */
+
+void gf_set_region_data(gf_region_data *rd,
+  gf_t *gf,
+  void *src,
+  void *dest,
+  int bytes,
+  uint64_t val,
+  int xor,
+  int align)
+{
+  gf_internal_t *h = NULL;
+  int wb;
+  uint32_t a;
+  unsigned long uls, uld;
+
+  if (gf == NULL) {  /* JSP - Can be NULL if you're just doing XOR's */
+    wb = 1;
+  } else {
+    h = gf->scratch;
+    wb = (h->w)/8;
+    if (wb == 0) wb = 1;
+  }
+  
+  rd->gf = gf;
+  rd->src = src;
+  rd->dest = dest;
+  rd->bytes = bytes;
+  rd->val = val;
+  rd->xor = xor;
+  rd->align = align;
+
+  uls = (unsigned long) src;
+  uld = (unsigned long) dest;
+
+  a = (align <= 16) ? align : 16;
+
+  if (align == -1) { /* JSP: This is cauchy.  Error check bytes, then set up the pointers
+                        so that there are no alignment regions. */
+    if (h != NULL && bytes % h->w != 0) {
+      fprintf(stderr, "Error in region multiply operation.\n");
+      fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
+      assert(0);
+    }
+  
+    rd->s_start = src;
+    rd->d_start = dest;
+    rd->s_top = (uint8_t *)src + bytes;
+    rd->d_top = (uint8_t *)dest + bytes;
+    return;
+  }
+
+  if (uls % a != uld % a) {
+    fprintf(stderr, "Error in region multiply operation.\n");
+    fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
+    fprintf(stderr, "to each other along a %d byte boundary.\n", a);
+    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
+            (unsigned long) dest);
+    assert(0);
+  }
+
+  if (uls % wb != 0) {
+    fprintf(stderr, "Error in region multiply operation.\n");
+    fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
+    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
+            (unsigned long) dest);
+    assert(0);
+  }
+
+  if (bytes % wb != 0) {
+    fprintf(stderr, "Error in region multiply operation.\n");
+    fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
+    assert(0);
+  }
+
+  uls %= a;
+  if (uls != 0) uls = (a-uls);
+  rd->s_start = (uint8_t *)rd->src + uls;
+  rd->d_start = (uint8_t *)rd->dest + uls;
+  bytes -= uls;
+  bytes -= (bytes % align);
+  rd->s_top = (uint8_t *)rd->s_start + bytes;
+  rd->d_top = (uint8_t *)rd->d_start + bytes;
+
+}
+
+void gf_do_initial_region_alignment(gf_region_data *rd)
+{
+  gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
+}
+
+void gf_do_final_region_alignment(gf_region_data *rd)
+{
+  gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes);
+}
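+
+/* A sketch (compiled out, for illustration) of the calling pattern described
+   above: a region implementation sets up rd, lets the helpers handle the
+   unaligned head and tail with single-word multiplies, and runs its fast loop
+   only on the aligned middle between s_start/d_start and s_top/d_top. */
+#if 0
+static void example_region_skeleton(gf_t *gf, void *src, void *dest,
+                                    uint64_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  /* ... fast (e.g. SIMD) loop from rd.s_start/rd.d_start up to rd.d_top ... */
+  gf_do_final_region_alignment(&rd);
+}
+#endif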
+
+void gf_multby_zero(void *dest, int bytes, int xor) 
+{
+  if (xor) return;
+  bzero(dest, bytes);
+  return;
+}
+
+/* JSP - gf_multby_one tries to do this in the most efficient way
+   possible.  If xor = 0, then simply call memcpy() since that
+   should be optimized by the system.  Otherwise, try to do the xor
+   in the following order:
+
+   If src and dest are aligned with respect to each other on 16-byte
+   boundaries and you have SSE instructions, then use aligned SSE
+   instructions.
+
+   If they aren't but you still have SSE instructions, use unaligned
+   SSE instructions.
+
+   If there are no SSE instructions, but they are aligned with
+   respect to each other on 8-byte boundaries, then do them with
+   uint64_t's.
+
+   Otherwise, call gf_unaligned_xor(), which does the following:
+   align a destination pointer along an 8-byte boundary, and then
+   memcpy 64 bytes at a time from the src pointer to an array of
+   uint64_t's.  I'm not sure if that's the best -- probably needs
+   testing, but this seems like it could be a black hole.
+ */
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes);
+
+void gf_multby_one(void *src, void *dest, int bytes, int xor) 
+{
+#ifdef   INTEL_SSE2
+  __m128i ms, md;
+#endif
+  unsigned long uls, uld;
+  uint8_t *s8, *d8;
+  uint64_t *s64, *d64, *dtop64;
+  gf_region_data rd;
+
+  if (!xor) {
+    memcpy(dest, src, bytes);
+    return;
+  }
+  uls = (unsigned long) src;
+  uld = (unsigned long) dest;
+
+#ifdef   INTEL_SSE2
+  int abytes;
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+  if (uls % 16 == uld % 16) {
+    gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+    while (s8 != rd.s_start) {
+      *d8 ^= *s8;
+      d8++;
+      s8++;
+    }
+    while (s8 < (uint8_t *) rd.s_top) {
+      ms = _mm_load_si128 ((__m128i *)(s8));
+      md = _mm_load_si128 ((__m128i *)(d8));
+      md = _mm_xor_si128(md, ms);
+      _mm_store_si128((__m128i *)(d8), md);
+      s8 += 16;
+      d8 += 16;
+    }
+    while (s8 != (uint8_t *) src + bytes) {
+      *d8 ^= *s8;
+      d8++;
+      s8++;
+    }
+    return;
+  }
+
+  abytes = (bytes & 0xfffffff0);
+
+  while (d8 < (uint8_t *) dest + abytes) {
+    ms = _mm_loadu_si128 ((__m128i *)(s8));
+    md = _mm_loadu_si128 ((__m128i *)(d8));
+    md = _mm_xor_si128(md, ms);
+    _mm_storeu_si128((__m128i *)(d8), md);
+    s8 += 16;
+    d8 += 16;
+  }
+  while (d8 != (uint8_t *) dest+bytes) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+  return;
+#endif
+#if defined(ARM_NEON)
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  if (uls % 16 == uld % 16) {
+    gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+    while (s8 != rd.s_start) {
+      *d8 ^= *s8;
+      s8++;
+      d8++;
+    }
+    while (s8 < (uint8_t *) rd.s_top) {
+      uint8x16_t vs = vld1q_u8 (s8);
+      uint8x16_t vd = vld1q_u8 (d8);
+      uint8x16_t vr = veorq_u8 (vs, vd);
+      vst1q_u8 (d8, vr);
+      s8 += 16;
+      d8 += 16;
+    }
+  } else {
+    while (s8 + 15 < (uint8_t *) src + bytes) {
+      uint8x16_t vs = vld1q_u8 (s8);
+      uint8x16_t vd = vld1q_u8 (d8);
+      uint8x16_t vr = veorq_u8 (vs, vd);
+      vst1q_u8 (d8, vr);
+      s8 += 16;
+      d8 += 16;
+    }
+  }
+  while (s8 < (uint8_t *) src + bytes) {
+    *d8 ^= *s8;
+    s8++;
+    d8++;
+  }
+  return;
+#endif
+  if (uls % 8 != uld % 8) {
+    gf_unaligned_xor(src, dest, bytes);
+    return;
+  }
+  
+  gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+  while (d8 != rd.d_start) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+  dtop64 = (uint64_t *) rd.d_top;
+
+  d64 = (uint64_t *) rd.d_start;
+  s64 = (uint64_t *) rd.s_start;
+
+  while (d64 < dtop64) {
+    *d64 ^= *s64;
+    d64++;
+    s64++;
+  }
+
+  s8 = (uint8_t *) rd.s_top;
+  d8 = (uint8_t *) rd.d_top;
+
+  while (d8 != (uint8_t *) dest+bytes) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+  return;
+}
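+
+/* A small sketch (compiled out, for illustration): with xor = 1,
+   gf_multby_one() is simply a wide XOR of src into dest, so accumulating
+   several buffers into a parity buffer looks like this (the hypothetical
+   example_xor_accumulate() is not part of the library). */
+#if 0
+static void example_xor_accumulate(void **data, int ndata, void *parity, int bytes)
+{
+  int i;
+
+  memset(parity, 0, bytes);
+  for (i = 0; i < ndata; i++) gf_multby_one(data[i], parity, bytes, 1);
+}
+#endif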
+
+#define UNALIGNED_BUFSIZE (8)
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes)
+{
+  uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
+  int i;
+  gf_region_data rd;
+  uint8_t *s8, *d8;
+
+  /* JSP - call gf_set_region_data(), but use dest in both places.  This is
+     because I only want to set up dest.  If I used src, gf_set_region_data()
+     would fail because src and dest are not aligned to each other wrt 
+     8-byte pointers.  I know this will actually align d_start to 16 bytes.
+     If I change gf_set_region_data() to split alignment & chunksize, then 
+     I could do this correctly. */
+
+  gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  while (d8 < (uint8_t *) rd.d_start) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+  
+  d64 = (uint64_t *) d8;
+  while (d64 < (uint64_t *) rd.d_top) {
+    memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);
+    s8 += 8*UNALIGNED_BUFSIZE;
+    for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
+      *d64 ^= scopy[i];
+      d64++;
+    }
+  }
+  
+  d8 = (uint8_t *) d64;
+  while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_general.c b/src/erasure-code/jerasure/gf-complete/src/gf_general.c
new file mode 100644
index 0000000..7f187b5
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_general.c
@@ -0,0 +1,539 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_general.c
+ *
+ * This file has helper routines for doing basic GF operations with any
+ * legal value of w.  The problem is that w <= 32, w=64 and w=128 all have
+ * different data types, which is a pain.  The procedures in this file try
+ * to alleviate that pain.  They are used in gf_unit and gf_time.
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+
+#include "gf_complete.h"
+#include "gf_int.h"
+#include "gf_method.h"
+#include "gf_rand.h"
+#include "gf_general.h"
+
+void gf_general_set_zero(gf_general_t *v, int w)
+{
+  if (w <= 32) {
+    v->w32 = 0;
+  } else if (w <= 64) {
+    v->w64 = 0;
+  } else {
+    v->w128[0] = 0;
+    v->w128[1] = 0;
+  }
+}
+
+void gf_general_set_one(gf_general_t *v, int w)
+{
+  if (w <= 32) {
+    v->w32 = 1;
+  } else if (w <= 64) {
+    v->w64 = 1;
+  } else {
+    v->w128[0] = 0;
+    v->w128[1] = 1;
+  }
+}
+
+void gf_general_set_two(gf_general_t *v, int w)
+{
+  if (w <= 32) {
+    v->w32 = 2;
+  } else if (w <= 64) {
+    v->w64 = 2;
+  } else {
+    v->w128[0] = 0;
+    v->w128[1] = 2;
+  }
+}
+
+int gf_general_is_zero(gf_general_t *v, int w) 
+{
+  if (w <= 32) {
+    return (v->w32 == 0);
+  } else if (w <= 64) {
+    return (v->w64 == 0);
+  } else {
+    return (v->w128[0] == 0 && v->w128[1] == 0);
+  }
+}
+
+int gf_general_is_one(gf_general_t *v, int w) 
+{
+  if (w <= 32) {
+    return (v->w32 == 1);
+  } else if (w <= 64) {
+    return (v->w64 == 1);
+  } else {
+    return (v->w128[0] == 0 && v->w128[1] == 1);
+  }
+}
+
+void gf_general_set_random(gf_general_t *v, int w, int zero_ok) 
+{
+  if (w <= 32) {
+      v->w32 = MOA_Random_W(w, zero_ok);
+  } else if (w <= 64) {
+    while (1) {
+      v->w64 = MOA_Random_64();
+      if (v->w64 != 0 || zero_ok) return;
+    }
+  } else {
+    while (1) {
+      MOA_Random_128(v->w128);
+      if (v->w128[0] != 0 || v->w128[1] != 0 || zero_ok) return;
+    }
+  }
+}
+
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex)
+{
+  if (w <= 32) {
+    if (hex) {
+      sprintf(s, "%x", v->w32);
+    } else {
+      sprintf(s, "%u", v->w32);
+    }
+  } else if (w <= 64) {
+    if (hex) {
+      sprintf(s, "%llx", (long long unsigned int) v->w64);
+    } else {
+      sprintf(s, "%lld", (long long unsigned int) v->w64);
+    }
+  } else {
+    if (v->w128[0] == 0) {
+      sprintf(s, "%llx", (long long unsigned int) v->w128[1]);
+    } else {
+      sprintf(s, "%llx%016llx", (long long unsigned int) v->w128[0], 
+                                (long long unsigned int) v->w128[1]);
+    }
+  }
+}
+
+int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex)
+{
+  int l;
+  int save;
+
+  if (w <= 32) {
+    if (hex) {
+      if (sscanf(s, "%x", &(v->w32)) == 0) return 0;
+    } else {
+      if (sscanf(s, "%u", &(v->w32)) == 0) return 0;
+    }
+    if (w == 32) return 1;
+    if (w == 31) {
+      if (v->w32 & (1 << 31)) return 0;
+      return 1;
+    } 
+    if (v->w32 & ~((1 << w)-1)) return 0;
+    return 1;
+  } else if (w <= 64) {
+    if (hex) return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w64))) == 1);
+    return (sscanf(s, "%lld", (long long int *) (&(v->w64))) == 1);
+  } else {
+    if (!hex) return 0;
+    l = strlen(s);
+    if (l <= 16) {
+      v->w128[0] = 0;
+      return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1);
+    } else {
+      if (l > 32) return 0;
+      save = s[l-16];
+      s[l-16] = '\0';
+      if (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[0]))) == 0) {
+        s[l-16] = save;
+        return 0;
+      }
+      return (sscanf(s+(l-16), "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1);
+    }
+  }
+}
+    
+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+  gf_internal_t *h;
+  int w;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+
+  if (w <= 32) {
+    c->w32 = a->w32 ^ b->w32;
+  } else if (w <= 64) {
+    c->w64 = a->w64 ^ b->w64;
+  } else {
+    c->w128[0] = a->w128[0] ^ b->w128[0];
+    c->w128[1] = a->w128[1] ^ b->w128[1];
+  }
+}
+  
+void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+  gf_internal_t *h;
+  int w;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+
+  if (w <= 32) {
+    c->w32 = gf->multiply.w32(gf, a->w32, b->w32);
+  } else if (w <= 64) {
+    c->w64 = gf->multiply.w64(gf, a->w64, b->w64);
+  } else {
+    gf->multiply.w128(gf, a->w128, b->w128, c->w128);
+  }
+}
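+
+/* A small sketch (compiled out, for illustration): the gf_general_t wrappers
+   let callers such as gf_unit work at any legal w without caring which union
+   member is live.  Here w is assumed to be the w the field was built with. */
+#if 0
+static void example_general_multiply(gf_t *gf, int w)
+{
+  gf_general_t a, b, c;
+  char buf[50];
+
+  gf_general_set_random(&a, w, 1);
+  gf_general_set_random(&b, w, 1);
+  gf_general_multiply(gf, &a, &b, &c);
+  gf_general_val_to_s(&c, w, buf, 1);
+  printf("product = %s\n", buf);
+}
+#endif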
+  
+void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+  gf_internal_t *h;
+  int w;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+
+  if (w <= 32) {
+    c->w32 = gf->divide.w32(gf, a->w32, b->w32);
+  } else if (w <= 64) {
+    c->w64 = gf->divide.w64(gf, a->w64, b->w64);
+  } else {
+    gf->divide.w128(gf, a->w128, b->w128, c->w128);
+  }
+}
+  
+void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b)
+{
+  gf_internal_t *h;
+  int w;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+
+  if (w <= 32) {
+    b->w32 = gf->inverse.w32(gf, a->w32);
+  } else if (w <= 64) {
+    b->w64 = gf->inverse.w64(gf, a->w64);
+  } else {
+    gf->inverse.w128(gf, a->w128, b->w128);
+  }
+}
+  
+int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w)
+{
+  if (w <= 32) {
+    return (v1->w32 == v2->w32);
+  } else if (w <= 64) {
+    return (v1->w64 == v2->w64);
+  } else {
+    return (v1->w128[0] == v2->w128[0] &&
+            v1->w128[1] == v2->w128[1]);
+  }
+}
+
+void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, void *ra, void *rb, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int w;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+
+  if (w <= 32) {
+    gf->multiply_region.w32(gf, ra, rb, a->w32, bytes, xor);
+  } else if (w <= 64) {
+    gf->multiply_region.w64(gf, ra, rb, a->w64, bytes, xor);
+  } else {
+    gf->multiply_region.w128(gf, ra, rb, a->w128, bytes, xor);
+  }
+}
+
+void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *orig_target, void *final_target, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int w, words, i;
+  gf_general_t oa, ot, ft, sb;
+  char sa[50], soa[50], sot[50], sft[50], ssb[50];
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+
+  words = (bytes * 8) / w;
+  for (i = 0; i < words; i++) {
+    if (w <= 32) {
+      oa.w32 = gf->extract_word.w32(gf, orig_a, bytes, i);
+      ot.w32 = gf->extract_word.w32(gf, orig_target, bytes, i);
+      ft.w32 = gf->extract_word.w32(gf, final_target, bytes, i);
+      sb.w32 = gf->multiply.w32(gf, a->w32, oa.w32);
+      if (xor) sb.w32 ^= ot.w32;
+    } else if (w <= 64) {
+      oa.w64 = gf->extract_word.w64(gf, orig_a, bytes, i);
+      ot.w64 = gf->extract_word.w64(gf, orig_target, bytes, i);
+      ft.w64 = gf->extract_word.w64(gf, final_target, bytes, i);
+      sb.w64 = gf->multiply.w64(gf, a->w64, oa.w64);
+      if (xor) sb.w64 ^= ot.w64;
+    } else {
+      gf->extract_word.w128(gf, orig_a, bytes, i, oa.w128);
+      gf->extract_word.w128(gf, orig_target, bytes, i, ot.w128);
+      gf->extract_word.w128(gf, final_target, bytes, i, ft.w128);
+      gf->multiply.w128(gf, a->w128, oa.w128, sb.w128);
+      if (xor) {
+        sb.w128[0] ^= ot.w128[0];
+        sb.w128[1] ^= ot.w128[1];
+      }
+    }
+
+    if (!gf_general_are_equal(&ft, &sb, w)) {
+      
+      fprintf(stderr,"Problem with region multiply (all values in hex):\n");
+      fprintf(stderr,"   Target address base: 0x%lx.  Word 0x%x of 0x%x.  Xor: %d\n", 
+                 (unsigned long) final_target, i, words, xor);
+      gf_general_val_to_s(a, w, sa, 1);
+      gf_general_val_to_s(&oa, w, soa, 1);
+      gf_general_val_to_s(&ot, w, sot, 1);
+      gf_general_val_to_s(&ft, w, sft, 1);
+      gf_general_val_to_s(&sb, w, ssb, 1);
+      fprintf(stderr,"   Value: %s\n", sa);
+      fprintf(stderr,"   Original source word: %s\n", soa);
+      if (xor) fprintf(stderr,"   XOR with target word: %s\n", sot);
+      fprintf(stderr,"   Product word: %s\n", sft);
+      fprintf(stderr,"   It should be: %s\n", ssb);
+      assert(0);
+    }
+  }
+}
+
+void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
+{
+  void *top;
+  gf_general_t g;
+  uint8_t *r8, *r8a;
+  uint16_t *r16;
+  uint32_t *r32;
+  uint64_t *r64;
+  int i;
+
+  top = (uint8_t *)rb+size;
+
+  /* If w is 8, 16, 32, 64 or 128, fill the regions with random bytes.
+     However, don't allow for zeros in rb, because that will screw up
+     division.
+     
+     When w is 4, you fill the regions with random 4-bit words in each byte.
+
+     Otherwise, treat every four bytes as an uint32_t
+     and fill it with a random value mod (1 << w).
+   */
+
+  if (w == 8 || w == 16 || w == 32 || w == 64 || w == 128) {
+    MOA_Fill_Random_Region (ra, size);
+    while (rb < top) {
+      gf_general_set_random(&g, w, 0);
+      switch (w) {
+        case 8: 
+          r8 = (uint8_t *) rb;
+          *r8 = g.w32;
+          break;
+        case 16: 
+          r16 = (uint16_t *) rb;
+          *r16 = g.w32;
+          break;
+        case 32: 
+          r32 = (uint32_t *) rb;
+          *r32 = g.w32;
+          break;
+        case 64:
+          r64 = (uint64_t *) rb;
+          *r64 = g.w64;
+          break;
+        case 128: 
+          r64 = (uint64_t *) rb;
+          r64[0] = g.w128[0];
+          r64[1] = g.w128[1];
+          break;
+      }
+      rb = (uint8_t *)rb + (w/8);
+    }
+  } else if (w == 4) {
+    r8a = (uint8_t *) ra;
+    r8 = (uint8_t *) rb;
+    while (r8 < (uint8_t *) top) {
+      gf_general_set_random(&g, w, 1);
+      *r8a = g.w32;
+      gf_general_set_random(&g, w, 0);
+      *r8 = g.w32;
+      r8a++;
+      r8++;
+    }
+  } else {
+    r32 = (uint32_t *) ra;
+    for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1);
+    r32 = (uint32_t *) rb;
+    for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 0);
+  }
+}
+
+/* This sucks, but in order to time, you really need to avoid putting ifs in 
+   the inner loops.  So, I'm doing a separate timing test for each w: 
+   (4 & 8), 16, 32, 64, 128 and everything else.  Fortunately, the "everything else"
+   tests can be equivalent to w=32.
+
+   I'm also putting the results back into ra, because otherwise, the optimizer might
+   figure out that we're not really doing anything in the inner loops and it 
+   will chuck that. */
+
+int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char test)
+{
+  gf_internal_t *h;
+  void *top;
+  uint8_t *r8a, *r8b, *top8;
+  uint16_t *r16a, *r16b, *top16;
+  uint32_t *r32a, *r32b, *top32;
+  uint64_t *r64a, *r64b, *top64, *r64c;
+  int w, rv;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  top = (uint8_t *)ra + size;
+
+  if (w == 8 || w == 4) {
+    r8a = (uint8_t *) ra; 
+    r8b = (uint8_t *) rb; 
+    top8 = (uint8_t *) top;
+    if (test == 'M') {
+      while (r8a < top8) {
+        *r8a = gf->multiply.w32(gf, *r8a, *r8b);
+        r8a++;
+        r8b++;
+      }
+    } else if (test == 'D') {
+      while (r8a < top8) {
+        *r8a = gf->divide.w32(gf, *r8a, *r8b);
+        r8a++;
+        r8b++;
+      }
+    } else if (test == 'I') {
+      while (r8a < top8) {
+        *r8a = gf->inverse.w32(gf, *r8a);
+        r8a++;
+      }
+    }
+    return (top8 - (uint8_t *) ra);
+  }
+
+  if (w == 16) {
+    r16a = (uint16_t *) ra; 
+    r16b = (uint16_t *) rb; 
+    top16 = (uint16_t *) top;
+    if (test == 'M') {
+      while (r16a < top16) {
+        *r16a = gf->multiply.w32(gf, *r16a, *r16b);
+        r16a++;
+        r16b++;
+      }
+    } else if (test == 'D') {
+      while (r16a < top16) {
+        *r16a = gf->divide.w32(gf, *r16a, *r16b);
+        r16a++;
+        r16b++;
+      }
+    } else if (test == 'I') {
+      while (r16a < top16) {
+        *r16a = gf->inverse.w32(gf, *r16a);
+        r16a++;
+      }
+    }
+    return (top16 - (uint16_t *) ra);
+  }
+  if (w <= 32) {
+    r32a = (uint32_t *) ra; 
+    r32b = (uint32_t *) rb; 
+    top32 = (uint32_t *) ra + (size/4); /* This is for the "everything elses" */
+    
+    if (test == 'M') {
+      while (r32a < top32) {
+        *r32a = gf->multiply.w32(gf, *r32a, *r32b);
+        r32a++;
+        r32b++;
+      }
+    } else if (test == 'D') {
+      while (r32a < top32) {
+        *r32a = gf->divide.w32(gf, *r32a, *r32b);
+        r32a++;
+        r32b++;
+      }
+    } else if (test == 'I') {
+      while (r32a < top32) {
+        *r32a = gf->inverse.w32(gf, *r32a);
+        r32a++;
+      }
+    }
+    return (top32 - (uint32_t *) ra);
+  }
+  if (w == 64) {
+    r64a = (uint64_t *) ra; 
+    r64b = (uint64_t *) rb; 
+    top64 = (uint64_t *) top;
+    if (test == 'M') {
+      while (r64a < top64) {
+        *r64a = gf->multiply.w64(gf, *r64a, *r64b);
+        r64a++;
+        r64b++;
+      }
+    } else if (test == 'D') {
+      while (r64a < top64) {
+        *r64a = gf->divide.w64(gf, *r64a, *r64b);
+        r64a++;
+        r64b++;
+      }
+    } else if (test == 'I') {
+      while (r64a < top64) {
+        *r64a = gf->inverse.w64(gf, *r64a);
+        r64a++;
+      }
+    }
+    return (top64 - (uint64_t *) ra);
+  }
+  if (w == 128) {
+    r64a = (uint64_t *) ra; 
+    r64c = r64a;
+    r64a += 2;
+    r64b = (uint64_t *) rb; 
+    top64 = (uint64_t *) top;
+    rv = (top64 - r64a)/2;
+    if (test == 'M') {
+      while (r64a < top64) {
+        gf->multiply.w128(gf, r64a, r64b, r64c);
+        r64a += 2;
+        r64b += 2;
+      }
+    } else if (test == 'D') {
+      while (r64a < top64) {
+        gf->divide.w128(gf, r64a, r64b, r64c);
+        r64a += 2;
+        r64b += 2;
+      }
+    } else if (test == 'I') {
+      while (r64a < top64) {
+        gf->inverse.w128(gf, r64a, r64c);
+        r64a += 2;
+      }
+    }
+    return rv;
+  }
+  return 0;
+}
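+
+/* A sketch (compiled out, for illustration) of how a driver such as gf_time
+   might use the two routines above: fill the buffers once, then repeatedly
+   run one test and add up the number of operations performed.  The actual
+   timing calls are omitted here. */
+#if 0
+static int example_time_multiplies(gf_t *gf, int w, void *ra, void *rb,
+                                   int size, int iterations)
+{
+  int i, ops = 0;
+
+  gf_general_set_up_single_timing_test(w, ra, rb, size);
+  for (i = 0; i < iterations; i++)
+    ops += gf_general_do_single_timing_test(gf, ra, rb, size, 'M');
+  return ops;
+}
+#endif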
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_method.c b/src/erasure-code/jerasure/gf-complete/src/gf_method.c
new file mode 100644
index 0000000..2210305
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_method.c
@@ -0,0 +1,193 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_method.c
+ *
+ * Parses argv to figure out the mult_type and arguments, and initializes the
+ * gf.  Returns the index of the next unconsumed argument, or 0 on failure.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_int.h"
+#include "gf_method.h"
+
+int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
+{
+  int mult_type, divide_type, region_type;
+  int arg1, arg2;
+  uint64_t prim_poly;
+  gf_t *base;
+
+  mult_type = GF_MULT_DEFAULT;
+  region_type = GF_REGION_DEFAULT;
+  divide_type = GF_DIVIDE_DEFAULT;
+  prim_poly = 0;
+  base = NULL;
+  arg1 = 0;
+  arg2 = 0;
+  while (1) {
+    if (argc > starting) {
+      if (strcmp(argv[starting], "-m") == 0) {
+        starting++;
+        if (mult_type != GF_MULT_DEFAULT) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_TWOMULT;
+          return 0;
+        }
+        if (strcmp(argv[starting], "SHIFT") == 0) {
+          mult_type = GF_MULT_SHIFT;
+          starting++;
+        } else if (strcmp(argv[starting], "CARRY_FREE") == 0) {
+          mult_type = GF_MULT_CARRY_FREE;
+          starting++;
+        } else if (strcmp(argv[starting], "CARRY_FREE_GK") == 0) {
+          mult_type = GF_MULT_CARRY_FREE_GK;
+          starting++;
+        } else if (strcmp(argv[starting], "GROUP") == 0) {
+          mult_type = GF_MULT_GROUP;
+          if (argc < starting + 3) {
+            _gf_errno = GF_E_GROUPAR;
+            return 0;
+          }
+          if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
+              sscanf(argv[starting+2], "%d", &arg2) == 0) {
+            _gf_errno = GF_E_GROUPNU;
+            return 0;
+          }
+          starting += 3;
+        } else if (strcmp(argv[starting], "BYTWO_p") == 0) {
+          mult_type = GF_MULT_BYTWO_p;
+          starting++;
+        } else if (strcmp(argv[starting], "BYTWO_b") == 0) {
+          mult_type = GF_MULT_BYTWO_b;
+          starting++;
+        } else if (strcmp(argv[starting], "TABLE") == 0) {
+          mult_type = GF_MULT_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG") == 0) {
+          mult_type = GF_MULT_LOG_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG_ZERO") == 0) {
+          mult_type = GF_MULT_LOG_ZERO;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) {
+          mult_type = GF_MULT_LOG_ZERO_EXT;
+          starting++;
+        } else if (strcmp(argv[starting], "SPLIT") == 0) {
+          mult_type = GF_MULT_SPLIT_TABLE;
+          if (argc < starting + 3) {
+            _gf_errno = GF_E_SPLITAR;
+            return 0;
+          }
+          if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
+              sscanf(argv[starting+2], "%d", &arg2) == 0) {
+            _gf_errno = GF_E_SPLITNU;
+            return 0;
+          }
+          starting += 3;
+        } else if (strcmp(argv[starting], "COMPOSITE") == 0) {
+          mult_type = GF_MULT_COMPOSITE;
+          if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; }
+          if (sscanf(argv[starting+1], "%d", &arg1) == 0) {
+            _gf_errno = GF_E_COMP_A2;
+            return 0;
+          }
+          starting += 2;
+          base = (gf_t *) malloc(sizeof(gf_t));
+          starting = create_gf_from_argv(base, w/arg1, argc, argv, starting);
+          if (starting == 0) {
+            free(base);
+            return 0;
+          }
+        } else {
+          _gf_errno = GF_E_UNKNOWN;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-r") == 0) {
+        starting++;
+        if (strcmp(argv[starting], "DOUBLE") == 0) {
+          region_type |= GF_REGION_DOUBLE_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "QUAD") == 0) {
+          region_type |= GF_REGION_QUAD_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LAZY") == 0) {
+          region_type |= GF_REGION_LAZY;
+          starting++;
+        } else if (strcmp(argv[starting], "SIMD") == 0) {
+          region_type |= GF_REGION_SIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "NOSIMD") == 0) {
+          region_type |= GF_REGION_NOSIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "SSE") == 0) {
+          region_type |= GF_REGION_SIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "NOSSE") == 0) {
+          region_type |= GF_REGION_NOSIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "CAUCHY") == 0) {
+          region_type |= GF_REGION_CAUCHY;
+          starting++;
+        } else if (strcmp(argv[starting], "ALTMAP") == 0) {
+          region_type |= GF_REGION_ALTMAP;
+          starting++;
+        } else {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_UNK_REG;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-p") == 0) {
+        starting++;
+        if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_POLYSPC;
+          return 0;
+        }
+        starting++;
+      } else if (strcmp(argv[starting], "-d") == 0) {
+        starting++;
+        if (divide_type != GF_DIVIDE_DEFAULT) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_TWO_DIV;
+          return 0;
+        } else if (strcmp(argv[starting], "EUCLID") == 0) {
+          divide_type = GF_DIVIDE_EUCLID;
+          starting++;
+        } else if (strcmp(argv[starting], "MATRIX") == 0) {
+          divide_type = GF_DIVIDE_MATRIX;
+          starting++;
+        } else {
+          _gf_errno = GF_E_UNK_DIV;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-") == 0) {
+         /*
+         printf("Scratch size: %d\n", gf_scratch_size(w, 
+                                      mult_type, region_type, divide_type, arg1, arg2));
+         */
+        if (gf_init_hard(gf, w, mult_type, region_type, divide_type, 
+                         prim_poly, arg1, arg2, base, NULL) == 0) {
+          if (base != NULL) gf_free(base, 1);
+          return 0;
+        } else
+          return starting + 1;
+      } else {
+        if (base != NULL) gf_free(base, 1);
+        _gf_errno = GF_E_UNKFLAG;
+        return 0;
+      }
+    } else {
+      if (base != NULL) gf_free(base, 1);
+      _gf_errno = GF_E_FEWARGS;
+      return 0;
+    }
+  }
+}
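+
+/* A small sketch (compiled out, for illustration): the argument list is the
+   tail of a command line, terminated by "-".  For example, "-m LOG -" selects
+   the log-table implementation, and a bare "-" takes every default. */
+#if 0
+static void example_create_from_argv(void)
+{
+  gf_t gf;
+  char *argv[] = { "-m", "LOG", "-" };
+
+  if (create_gf_from_argv(&gf, 16, 3, argv, 0) == 0) {
+    fprintf(stderr, "create_gf_from_argv failed: _gf_errno = %d\n", _gf_errno);
+    return;
+  }
+  gf_free(&gf, 1);
+}
+#endif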
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_rand.c b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c
new file mode 100644
index 0000000..a9aa7ad
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c
@@ -0,0 +1,80 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_rand.c -- Random number generator.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "gf_rand.h"
+
+/* Lifted the "Mother of All" random number generator from http://www.agner.org/random/ */
+
+static uint32_t MOA_X[5];
+
+uint32_t MOA_Random_32() {
+  uint64_t sum;
+  sum = (uint64_t)2111111111UL * (uint64_t)MOA_X[3] +
+     (uint64_t)1492 * (uint64_t)(MOA_X[2]) +
+     (uint64_t)1776 * (uint64_t)(MOA_X[1]) +
+     (uint64_t)5115 * (uint64_t)(MOA_X[0]) +
+     (uint64_t)MOA_X[4];
+  MOA_X[3] = MOA_X[2];  MOA_X[2] = MOA_X[1];  MOA_X[1] = MOA_X[0];
+  MOA_X[4] = (uint32_t)(sum >> 32);
+  MOA_X[0] = (uint32_t)sum;
+  return MOA_X[0];
+}
+
+uint64_t MOA_Random_64() {
+  uint64_t sum;
+
+  sum = MOA_Random_32();
+  sum <<= 32;
+  sum |= MOA_Random_32();
+  return sum;
+}
+
+void MOA_Random_128(uint64_t *x) {
+  x[0] = MOA_Random_64();
+  x[1] = MOA_Random_64();
+  return;
+}
+
+uint32_t MOA_Random_W(int w, int zero_ok)
+{
+  uint32_t b;
+
+  do {
+    b = MOA_Random_32();
+    if (w == 31) b &= 0x7fffffff;
+    if (w < 31)  b %= (1 << w);
+  } while (!zero_ok && b == 0);
+  return b;
+}
+
+void MOA_Seed(uint32_t seed) {
+  int i;
+  uint32_t s = seed;
+  for (i = 0; i < 5; i++) {
+    s = s * 29943829 - 1;
+    MOA_X[i] = s;
+  }
+  for (i=0; i<19; i++) MOA_Random_32();
+}
+
+
+void MOA_Fill_Random_Region (void *reg, int size)
+{
+  uint32_t *r32;
+  uint8_t *r8;
+  int i;
+
+  r32 = (uint32_t *) reg;
+  r8 = (uint8_t *) reg;
+  for (i = 0; i < size/4; i++) r32[i] = MOA_Random_32();
+  for (i *= 4; i < size; i++) r8[i] = MOA_Random_W(8, 1);
+}
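+
+/* A small sketch (compiled out, for illustration): seed the generator first;
+   unseeded, the state is all zeros and MOA_Random_32() just returns zeros. */
+#if 0
+static void example_rand(void)
+{
+  uint64_t v128[2];
+
+  MOA_Seed(0xdeadbeef);
+  printf("32 bits: %u\n", MOA_Random_32());
+  printf(" 4 bits: %u\n", MOA_Random_W(4, 1));
+  MOA_Random_128(v128);               /* two random 64-bit halves */
+}
+#endif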
+
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c
new file mode 100644
index 0000000..b6cfeba
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c
@@ -0,0 +1,1783 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w128.c
+ *
+ * Routines for 128-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define GF_FIELD_WIDTH (128)
+
+#define two_x(a) {\
+  a[0] <<= 1; \
+  if (a[1] & 1ULL << 63) a[0] ^= 1; \
+  a[1] <<= 1; }
+  
+#define a_get_b(a, i, b, j) {\
+  a[i] = b[j]; \
+  a[i + 1] = b[j + 1];}
+
+#define set_zero(a, i) {\
+  a[i] = 0; \
+  a[i + 1] = 0;}
+
+struct gf_w128_split_4_128_data {
+  uint64_t last_value[2];
+  uint64_t tables[2][32][16];
+};
+
+struct gf_w128_split_8_128_data {
+  uint64_t last_value[2];
+  uint64_t tables[2][16][256];
+};
+
+typedef struct gf_group_tables_s {
+  gf_val_128_t m_table;
+  gf_val_128_t r_table;
+} gf_group_tables_t;
+
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
+
+static
+void
+gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
+int xor)
+{
+    uint32_t i;
+    gf_val_128_t s128;
+    gf_val_128_t d128;
+    uint64_t c128[2];
+    gf_region_data rd;
+
+    /* We only do this to check on alignment. */
+    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+    if (val[0] == 0) {
+      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+    }
+
+    set_zero(c128, 0);
+
+    s128 = (gf_val_128_t) src;
+    d128 = (gf_val_128_t) dest;
+
+    if (xor) {
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+        gf->multiply.w128(gf, &s128[i], val, c128);
+        d128[i] ^= c128[0];
+        d128[i+1] ^= c128[1];
+      }
+    } else {
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+        gf->multiply.w128(gf, &s128[i], val, &d128[i]);
+      }
+    }
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
+int xor)
+{
+    uint32_t i;
+    gf_val_128_t s128;
+    gf_val_128_t d128;
+    gf_region_data rd;
+    __m128i     a,b;
+    __m128i     result0,result1;
+    __m128i     prim_poly;
+    __m128i     c,d,e,f;
+    gf_internal_t * h = gf->scratch;
+    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+    /* We only do this to check on alignment. */
+    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+    if (val[0] == 0) {
+      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+    }
+
+    s128 = (gf_val_128_t) src;
+    d128 = (gf_val_128_t) dest;
+
+    if (xor) {
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
+        b = _mm_insert_epi64 (a, val[1], 0);
+        a = _mm_insert_epi64 (a, s128[i], 1);
+        b = _mm_insert_epi64 (b, val[0], 1);
+    
+        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
+        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
+
+        /* now reusing a and b as temporary variables*/
+        result0 = _mm_setzero_si128();
+        result1 = result0;
+
+        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */
+
+        a = _mm_srli_si128 (result0, 8);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+
+        a = _mm_insert_epi64 (result0, 0, 1);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result1 = _mm_xor_si128 (result1, b); 
+        d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
+        d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
+      }
+    } else {
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
+        b = _mm_insert_epi64 (a, val[1], 0);
+        a = _mm_insert_epi64 (a, s128[i], 1);
+        b = _mm_insert_epi64 (b, val[0], 1);
+
+        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ 
+        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ 
+
+        /* now reusing a and b as temporary variables*/
+        result0 = _mm_setzero_si128();
+        result1 = result0;
+
+        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
+
+        a = _mm_srli_si128 (result0, 8);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+
+        a = _mm_insert_epi64 (result0, 0, 1);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result1 = _mm_xor_si128 (result1, b);
+        d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
+        d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
+      }
+    }
+}
+#endif
+
+/*
+ * Some w128 notes:
+ * --Big Endian
+ * --return values allocated beforehand
+ */
+
+#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)
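+
+/* A small sketch (compiled out, for illustration): a gf_val_128_t is two
+   uint64_t's, high half in element 0 and low half in element 1 (big endian,
+   as noted above), and the product goes into storage the caller has already
+   allocated. */
+#if 0
+static void example_w128_multiply(gf_t *gf)
+{
+  uint64_t a[2] = { 0, 2 };        /* the value 2 */
+  uint64_t b[2] = { 0, 3 };        /* the value 3 */
+  uint64_t c[2];
+
+  gf->multiply.w128(gf, a, b, c);  /* c <- a * b, here { 0, 6 } */
+}
+#endif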
+
+void
+gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */
+  uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) {
+    set_zero(c128, 0);
+    return;
+  }
+
+  a_get_b(a, 0, a128, 0);
+  a_get_b(br, 0, b128, 0);
+  set_zero(bl, 0);
+
+  one = 1;
+  lbit = (one << 63);
+
+  set_zero(pl, 0);
+  set_zero(pr, 0);
+
+  /* Allen: a*b for right half of a */
+  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
+    if (a[1] & (one << i)) {
+      pl[1] ^= bl[1];
+      pr[0] ^= br[0];
+      pr[1] ^= br[1];
+    }
+    bl[1] <<= 1;
+    if (br[0] & lbit) bl[1] ^= 1;
+    br[0] <<= 1;
+    if (br[1] & lbit) br[0] ^= 1;
+    br[1] <<= 1;
+  }
+
+  /* Allen: a*b for left half of a */
+  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
+    if (a[0] & (one << i)) {
+      pl[0] ^= bl[0];
+      pl[1] ^= bl[1];
+      pr[0] ^= br[0];
+    }
+    bl[0] <<= 1;
+    if (bl[1] & lbit) bl[0] ^= 1;
+    bl[1] <<= 1;
+    if (br[0] & lbit) bl[1] ^= 1;
+    br[0] <<= 1;
+  }
+
+  /* Allen: do first half of reduction (based on left quarter of initial product) */
+  one = lbit >> 1;
+  ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */
+  ppl[1] = h->prim_poly >> 2;
+  ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2);
+  ppr[1] = 0;
+  while (one != 0) {
+    if (pl[0] & one) {
+      pl[0] ^= ppl[0];
+      pl[1] ^= ppl[1];
+      pr[0] ^= ppr[0];
+      pr[1] ^= ppr[1];
+    }
+    one >>= 1;
+    ppr[1] >>= 1;
+    if (ppr[0] & 1) ppr[1] ^= lbit;
+    ppr[0] >>= 1;
+    if (ppl[1] & 1) ppr[0] ^= lbit;
+    ppl[1] >>= 1;
+    if (ppl[0] & 1) ppl[1] ^= lbit;
+    ppl[0] >>= 1;
+  }
+
+  /* Allen: final half of reduction */
+  one = lbit;
+  while (one != 0) {
+    if (pl[1] & one) {
+      pl[1] ^= ppl[1];
+      pr[0] ^= ppr[0];
+      pr[1] ^= ppr[1];
+    }
+    one >>= 1;
+    ppr[1] >>= 1;
+    if (ppr[0] & 1) ppr[1] ^= lbit;
+    ppr[0] >>= 1;
+    if (ppl[1] & 1) ppr[0] ^= lbit;
+    ppl[1] >>= 1;
+  }
+
+  /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */
+  c128[0] = pr[0];
+  c128[1] = pr[1];
+
+  return;
+}
+
+void
+gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+#if defined(INTEL_SSE4_PCLMUL)
+
+    __m128i     a,b;
+    __m128i     result0,result1;
+    __m128i     prim_poly;
+    __m128i     c,d,e,f;
+    gf_internal_t * h = gf->scratch;
+    
+    a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
+    b = _mm_insert_epi64 (a, b128[1], 0);
+    a = _mm_insert_epi64 (a, a128[0], 1);
+    b = _mm_insert_epi64 (b, b128[0], 1);
+
+    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+
+    /* we need to test algorithm 2 later*/
+    c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+    f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+    e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
+    d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
+    
+    /* now reusing a and b as temporary variables*/
+    result0 = _mm_setzero_si128();
+    result1 = result0;
+
+    result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+    a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+    result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+    a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+    result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+    result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+    /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
+    
+    a = _mm_srli_si128 (result0, 8);
+    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+    result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+    result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+    
+    a = _mm_insert_epi64 (result0, 0, 1);
+    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+    result1 = _mm_xor_si128 (result1, b);
+
+    c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
+    c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
+#endif
+return;
+}
+
+void
+gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
+  uint64_t topbit; /* this is used as a boolean value */
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  prod[0] = 0;
+  prod[1] = 0;
+  pmask = 0x8000000000000000ULL;
+  amask[0] = 0x8000000000000000ULL;
+  amask[1] = 0;
+
+  while (amask[1] != 0 || amask[0] != 0) {
+    topbit = (prod[0] & pmask);
+    prod[0] <<= 1;
+    if (prod[1] & pmask) prod[0] ^= 1;
+    prod[1] <<= 1;
+    if (topbit) prod[1] ^= pp;
+    if ((a128[0] & amask[0]) || (a128[1] & amask[1])) {
+      prod[0] ^= b128[0];
+      prod[1] ^= b128[1];
+    }
+    amask[1] >>= 1;
+    if (amask[0] & 1) amask[1] ^= pmask;
+    amask[0] >>= 1;
+  }
+  c128[0] = prod [0];
+  c128[1] = prod [1];
+  return;
+}
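+
+/* A sketch (compiled out, for illustration) of the same BYTWO_p pattern at
+   w=8, which may be easier to follow than the two-word version above: scan a
+   from its top bit down; each step doubles the product modulo the field
+   polynomial (0x11d, a common choice for GF(2^8)) and xors in b whenever the
+   scanned bit of a is set. */
+#if 0
+static uint8_t example_bytwo_p_w8(uint8_t a, uint8_t b)
+{
+  uint16_t prod = 0;
+  int i;
+
+  for (i = 7; i >= 0; i--) {
+    prod <<= 1;
+    if (prod & 0x100) prod ^= 0x11d;   /* reduce when the product overflows */
+    if (a & (1 << i)) prod ^= b;
+  }
+  return (uint8_t) prod;
+}
+#endif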
+
+void
+gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+#if defined(INTEL_SSE4)
+  int i;
+  __m128i a, b, pp, prod, amask, u_middle_one; 
+  /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
+  uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */
+  gf_internal_t *h;
+
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+  prod = _mm_setzero_si128();
+  a = _mm_insert_epi64(prod, a128[1], 0x0);
+  a = _mm_insert_epi64(a, a128[0], 0x1);
+  b = _mm_insert_epi64(prod, b128[1], 0x0);
+  b = _mm_insert_epi64(b, b128[0], 0x1);
+  pmask = 0x80000000;
+  amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
+  u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
+  
+  for (i = 0; i < 64; i++) {
+    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
+    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
+    prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */
+    if (middlebit) {
+      prod = _mm_xor_si128(prod, u_middle_one);
+    }
+    if (topbit) {
+      prod = _mm_xor_si128(prod, pp);
+    }
+    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
+      prod = _mm_xor_si128(prod, b);
+    }
+    amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
+  }
+  amask = _mm_insert_epi32(amask, 1 << 31, 0x1);
+  for (i = 64; i < 128; i++) {
+    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
+    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
+    prod = _mm_slli_epi64(prod, 1);
+    if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
+    if (topbit) prod = _mm_xor_si128(prod, pp);
+    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
+      prod = _mm_xor_si128(prod, b);
+    }
+    amask = _mm_srli_epi64(amask, 1);
+  }
+  c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
+  c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
+#endif
+  return;
+}
+
+
+/* Ben: This slow function implements sse instructions for bytwo_b because why not */
+void
+gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+#if defined(INTEL_SSE4)
+  __m128i a, b, lmask, hmask, pp, c, middle_one;
+  gf_internal_t *h;
+  uint64_t topbit, middlebit;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  c = _mm_setzero_si128();
+  lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
+  hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
+  b = _mm_insert_epi64(c, a128[0], 1);
+  b = _mm_insert_epi64(b, a128[1], 0);
+  a = _mm_insert_epi64(c, b128[0], 1);
+  a = _mm_insert_epi64(a, b128[1], 0);
+  pp = _mm_insert_epi64(c, h->prim_poly, 0);
+  middle_one = _mm_insert_epi64(c, 1, 0x1);
+
+  while (1) {
+    if (_mm_extract_epi32(a, 0x0) & 1) {
+      c = _mm_xor_si128(c, b);
+    }
+    middlebit = (_mm_extract_epi32(a, 0x2) & 1);
+    a = _mm_srli_epi64(a, 1);
+    if (middlebit) a = _mm_xor_si128(a, lmask);
+    if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
+      c128[0] = _mm_extract_epi64(c, 0x1);
+      c128[1] = _mm_extract_epi64(c, 0x0);
+      return;
+    }
+    topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
+    middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
+    b = _mm_slli_epi64(b, 1);
+    if (middlebit) b = _mm_xor_si128(b, middle_one);
+    if (topbit) b = _mm_xor_si128(b, pp);
+  }
+#endif
+}
+
+void
+gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  uint64_t bmask, pp;
+  gf_internal_t *h;
+  uint64_t a[2], b[2], c[2];
+
+  h = (gf_internal_t *) gf->scratch;
+
+  bmask = (1ULL << 63);
+  set_zero(c, 0);
+  b[0] = a128[0];
+  b[1] = a128[1];
+  a[0] = b128[0];
+  a[1] = b128[1];
+  
+  while (1) {
+    if (a[1] & 1) {
+      c[0] ^= b[0];
+      c[1] ^= b[1];
+    }
+    a[1] >>= 1;
+    if (a[0] & 1) a[1] ^= bmask;
+    a[0] >>= 1;
+    if (a[1] == 0 && a[0] == 0) {
+      c128[0] = c[0];
+      c128[1] = c[1];
+      return;
+    }
+    pp = (b[0] & bmask);
+    b[0] <<= 1;
+    if (b[1] & bmask) b[0] ^= 1;
+    b[1] <<= 1;
+    if (pp) b[1] ^= h->prim_poly;
+  }
+}
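+
+/* Sketch of the bytwo_b loop above: a is scanned from its least significant
+   bit while b is doubled (multiplied by x, with reduction) each step, so the
+   loop accumulates
+
+     a*b = XOR over the set bits i of a of (b * x^i)   (mod prim_poly),
+
+   stopping as soon as no bits of a remain. */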
+
+static
+void
+gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  int i, j, k;
+  uint64_t pp;
+  gf_internal_t *h;
+  uint64_t *s64, *d64, *top;
+  gf_region_data rd;
+  uint64_t v[2], s;
+  struct gf_w128_split_4_128_data *ld;
+
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+    
+  h = (gf_internal_t *) gf->scratch;
+  ld = (struct gf_w128_split_4_128_data *) h->private;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+    v[0] = val[0];
+    v[1] = val[1];
+    for (i = 0; i < 32; i++) {
+      ld->tables[0][i][0] = 0;
+      ld->tables[1][i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+        }
+        pp = (v[0] & (1ULL << 63));
+        v[0] <<= 1;
+        if (v[1] & (1ULL << 63)) v[0] ^= 1;
+        v[1] <<= 1;
+        if (pp) v[1] ^= h->prim_poly;
+      }
+    }
+  }
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+/*
+  for (i = 0; i < 32; i++) {
+    for (j = 0; j < 16; j++) {
+      printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]);
+    }
+    printf("\n");
+  }
+ */
+  while (d64 < top) {
+    v[0] = (xor) ? d64[0] : 0;
+    v[1] = (xor) ? d64[1] : 0;
+    s = s64[1];
+    i = 0;
+    while (s != 0) {
+      v[0] ^= ld->tables[0][i][s&0xf];
+      v[1] ^= ld->tables[1][i][s&0xf];
+      s >>= 4;
+      i++;
+    }
+    s = s64[0];
+    i = 16;
+    while (s != 0) {
+      v[0] ^= ld->tables[0][i][s&0xf];
+      v[1] ^= ld->tables[1][i][s&0xf];
+      s >>= 4;
+      i++;
+    }
+    d64[0] = v[0];
+    d64[1] = v[1];
+    s64 += 2;
+    d64 += 2;
+  }
+}
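+
+/* Rough sketch of the lazy split-4 tables built above: tables[0][i][n] and
+   tables[1][i][n] hold the two 64-bit halves of (n * x^(4*i)) * val, so each
+   128-bit source word is multiplied by val with one table lookup per 4-bit
+   nibble (at most 32 lookups) xored together. */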
+
+#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
+static
+void
+gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v[2], s, *s64, *d64, *top;
+  __m128i p, tables[32][16];
+  struct gf_w128_split_4_128_data *ld;
+  gf_region_data rd;
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16);
+
+  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't handle 128-bit values */
+
+  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+ 
+  ld = (struct gf_w128_split_4_128_data *) h->private;
+
+  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+    v[0] = val[0];
+    v[1] = val[1];
+    for (i = 0; i < 32; i++) {
+      ld->tables[0][i][0] = 0;
+      ld->tables[1][i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+        }
+        pp = (v[0] & (1ULL << 63));
+        v[0] <<= 1;
+        if (v[1] & (1ULL << 63)) v[0] ^= 1;
+        v[1] <<= 1;
+        if (pp) v[1] ^= h->prim_poly;
+      }
+    }
+  }
+
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+  for (i = 0; i < 32; i++) {
+    for (j = 0; j < 16; j++) {
+      v[0] = ld->tables[0][i][j];
+      v[1] = ld->tables[1][i][j];
+      tables[i][j] = _mm_loadu_si128((__m128i *) v);
+
+/*
+      printf("%2d %2d: ", i, j);
+      MM_PRINT8("", tables[i][j]); */
+    }
+  }
+
+  while (d64 != top) {
+
+    if (xor) {
+      p = _mm_load_si128 ((__m128i *) d64);
+    } else {
+      p = _mm_setzero_si128();
+    }
+    s = *s64;
+    s64++;
+    for (i = 0; i < 16; i++) {
+      j = (s&0xf);
+      s >>= 4;
+      p = _mm_xor_si128(p, tables[16+i][j]);
+    }
+    s = *s64;
+    s64++;
+    for (i = 0; i < 16; i++) {
+      j = (s&0xf);
+      s >>= 4;
+      p = _mm_xor_si128(p, tables[i][j]);
+    }
+    _mm_store_si128((__m128i *) d64, p);
+    d64 += 2;
+  }
+
+  /* Doing this instead of gf_do_final_region_alignment() because that doesn't handle 128-bit values */
+
+  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
+}
+#endif
+
+#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
+static
+void
+gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v[2], *s64, *d64, *top;
+  __m128i si, tables[32][16], p[16], v0, mask1;
+  struct gf_w128_split_4_128_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);
+
+  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't handle 128-bit values */
+
+  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+ 
+  ld = (struct gf_w128_split_4_128_data *) h->private;
+
+  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+    v[0] = val[0];
+    v[1] = val[1];
+    for (i = 0; i < 32; i++) {
+      ld->tables[0][i][0] = 0;
+      ld->tables[1][i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+        }
+        pp = (v[0] & (1ULL << 63));
+        v[0] <<= 1;
+        if (v[1] & (1ULL << 63)) v[0] ^= 1;
+        v[1] <<= 1;
+        if (pp) v[1] ^= h->prim_poly;
+      }
+    }
+  }
+
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+  for (i = 0; i < 32; i++) {
+    for (j = 0; j < 16; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k];
+        ld->tables[1-(j/8)][i][k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+/*
+      printf("%2d %2d: ", i, j);
+      MM_PRINT8("", tables[i][j]);
+ */
+    }
+  }
+
+
+  mask1 = _mm_set1_epi8(0xf);
+
+  while (d64 != top) {
+
+    if (xor) {
+      for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
+    } else {
+      for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();
+    }
+    i = 0;
+    for (k = 0; k < 16; k++) {
+      v0 = _mm_load_si128((__m128i *) s64); 
+      s64 += 2;
+      
+      si = _mm_and_si128(v0, mask1);
+  
+      for (j = 0; j < 16; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      for (j = 0; j < 16; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+    }
+    for (i = 0; i < 16; i++) {
+      _mm_store_si128((__m128i *) d64, p[i]);
+      d64 += 2;
+    }
+  }
+  /* Doing this instead of gf_do_final_region_alignment() because that doesn't handle 128-bit values */
+
+  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
+}
+#endif
+
+static
+void
+gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  int i, j, k;
+  uint64_t pp;
+  gf_internal_t *h;
+  uint64_t *s64, *d64, *top;
+  gf_region_data rd;
+  uint64_t v[2], s;
+  struct gf_w128_split_8_128_data *ld;
+
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+    
+  h = (gf_internal_t *) gf->scratch;
+  ld = (struct gf_w128_split_8_128_data *) h->private;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+    v[0] = val[0];
+    v[1] = val[1];
+    for (i = 0; i < 16; i++) {
+      ld->tables[0][i][0] = 0;
+      ld->tables[1][i][0] = 0;
+      for (j = 1; j < (1 << 8); j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+        }
+        pp = (v[0] & (1ULL << 63));
+        v[0] <<= 1;
+        if (v[1] & (1ULL << 63)) v[0] ^= 1;
+        v[1] <<= 1;
+        if (pp) v[1] ^= h->prim_poly;
+      }
+    }
+  }
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+  while (d64 < top) {
+    v[0] = (xor) ? d64[0] : 0;
+    v[1] = (xor) ? d64[1] : 0;
+    s = s64[1];
+    i = 0;
+    while (s != 0) {
+      v[0] ^= ld->tables[0][i][s&0xff];
+      v[1] ^= ld->tables[1][i][s&0xff];
+      s >>= 8;
+      i++;
+    }
+    s = s64[0];
+    i = 8;
+    while (s != 0) {
+      v[0] ^= ld->tables[0][i][s&0xff];
+      v[1] ^= ld->tables[1][i][s&0xff];
+      s >>= 8;
+      i++;
+    }
+    d64[0] = v[0];
+    d64[1] = v[1];
+    s64 += 2;
+    d64 += 2;
+  }
+}
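+
+/* Same lazy-table idea as the 4-bit split above, but with 8-bit chunks: one
+   256-entry table per byte position (16 of them, each stored as high/low
+   64-bit halves), so a 128-bit source word needs at most 16 lookups instead
+   of 32, at the cost of larger tables. */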
+
+void
+gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  uint64_t bmask, pp;
+  gf_internal_t *h;
+  uint64_t a[2], c[2], b[2], *s64, *d64, *top;
+  gf_region_data rd;
+
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+    
+  h = (gf_internal_t *) gf->scratch;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+  bmask = (1ULL << 63);
+
+  while (d64 < top) {
+    set_zero(c, 0);
+    b[0] = s64[0];
+    b[1] = s64[1];
+    a[0] = val[0];
+    a[1] = val[1];
+
+    while (a[0] != 0) {
+      if (a[1] & 1) {
+        c[0] ^= b[0];
+        c[1] ^= b[1];
+      }
+      a[1] >>= 1;
+      if (a[0] & 1) a[1] ^= bmask;
+      a[0] >>= 1;
+      pp = (b[0] & bmask);
+      b[0] <<= 1;
+      if (b[1] & bmask) b[0] ^= 1;    
+      b[1] <<= 1;
+      if (pp) b[1] ^= h->prim_poly;
+    }
+    while (1) {
+      if (a[1] & 1) {
+        c[0] ^= b[0];
+        c[1] ^= b[1];
+      }
+      a[1] >>= 1;
+      if (a[1] == 0) break;
+      pp = (b[0] & bmask);
+      b[0] <<= 1;
+      if (b[1] & bmask) b[0] ^= 1;    
+      b[1] <<= 1;
+      if (pp) b[1] ^= h->prim_poly;
+    }
+    if (xor) {
+      d64[0] ^= c[0];
+      d64[1] ^= c[1];
+    } else {
+      d64[0] = c[0];
+      d64[1] = c[1];
+    }
+    s64 += 2;
+    d64 += 2;
+  }
+}
+
+static
+void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
+{
+  int i, j;
+  int g_m;
+  uint64_t prim_poly, lbit;
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  uint64_t a128[2];
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  g_m = scratch->arg1;
+  prim_poly = scratch->prim_poly;
+
+
+  set_zero(gt->m_table, 0);
+  a_get_b(gt->m_table, 2, b128, 0);
+  lbit = 1;
+  lbit <<= 63;
+
+  for (i = 2; i < (1 << g_m); i <<= 1) {
+    a_get_b(a128, 0, gt->m_table, 2 * (i >> 1));
+    two_x(a128);
+    a_get_b(gt->m_table, 2 * i, a128, 0);
+    if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly;
+    for (j = 0; j < i; j++) {
+      gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)];
+      gt->m_table[(2 * i) + (2 * j) + 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1];
+    }
+  }
+  return;
+}
+
+void
+gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  int i;
+  /* index_r, index_m, total_m (if g_r > g_m) */
+  int i_r, i_m, t_m;
+  int mask_m, mask_r;
+  int g_m, g_r;
+  uint64_t p_i[2], a[2];
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  g_m = scratch->arg1;
+  g_r = scratch->arg2;
+
+  mask_m = (1 << g_m) - 1;
+  mask_r = (1 << g_r) - 1;
+
+  if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) {
+    gf_w128_group_m_init(gf, b128);
+  }
+  
+  p_i[0] = 0;
+  p_i[1] = 0;
+  a[0] = a128[0];
+  a[1] = a128[1];
+
+  t_m = 0;
+  i_r = 0;
+
+  /* Top 64 bits */
+  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+    i_m = (a[0] >> (i * g_m)) & mask_m;
+    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
+    p_i[0] <<= g_m;
+    p_i[0] ^= (p_i[1] >> (64-g_m));
+    p_i[1] <<= g_m;
+    p_i[0] ^= gt->m_table[2 * i_m];
+    p_i[1] ^= gt->m_table[(2 * i_m) + 1];
+    t_m += g_m;
+    if (t_m == g_r) {
+      p_i[1] ^= gt->r_table[i_r];
+      t_m = 0;
+      i_r = 0;
+    } else {
+      i_r <<= g_m;
+    }
+  }
+
+  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+    i_m = (a[1] >> (i * g_m)) & mask_m;
+    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
+    p_i[0] <<= g_m;
+    p_i[0] ^= (p_i[1] >> (64-g_m));
+    p_i[1] <<= g_m;
+    p_i[0] ^= gt->m_table[2 * i_m];
+    p_i[1] ^= gt->m_table[(2 * i_m) + 1];
+    t_m += g_m;
+    if (t_m == g_r) {
+      p_i[1] ^= gt->r_table[i_r];
+      t_m = 0;
+      i_r = 0;
+    } else {
+      i_r <<= g_m;
+    }
+  }
+  c128[0] = p_i[0];
+  c128[1] = p_i[1];
+}
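+
+/* Rough sketch of the GROUP scheme above: a is consumed g_m bits at a time,
+   m_table caches i*b (as high/low 64-bit pairs) for every g_m-bit index i,
+   and the bits shifted off the top of the 128-bit accumulator are collected
+   in i_r and reduced through r_table (precomputed multiples of prim_poly)
+   once g_r of them have accumulated. */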
+
+static
+void
+gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  int i;
+  int i_r, i_m, t_m;
+  int mask_m, mask_r;
+  int g_m, g_r;
+  uint64_t p_i[2], a[2];
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  gf_region_data rd;
+  uint64_t *a128, *c128, *top;
+
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+      
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+    
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  g_m = scratch->arg1;
+  g_r = scratch->arg2;
+
+  mask_m = (1 << g_m) - 1;
+  mask_r = (1 << g_r) - 1;
+
+  if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) {
+    gf_w128_group_m_init(gf, val);
+  }
+
+  a128 = (uint64_t *) src;
+  c128 = (uint64_t *) dest;
+  top = (uint64_t *) rd.d_top;
+
+  while (c128 < top) {
+    p_i[0] = 0;
+    p_i[1] = 0;
+    a[0] = a128[0];
+    a[1] = a128[1];
+  
+    t_m = 0;
+    i_r = 0;
+  
+    /* Top 64 bits */
+    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+      i_m = (a[0] >> (i * g_m)) & mask_m;
+      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
+      p_i[0] <<= g_m;
+      p_i[0] ^= (p_i[1] >> (64-g_m));
+      p_i[1] <<= g_m;
+      
+      p_i[0] ^= gt->m_table[2 * i_m];
+      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
+      t_m += g_m;
+      if (t_m == g_r) {
+        p_i[1] ^= gt->r_table[i_r];
+        t_m = 0;
+        i_r = 0;
+      } else {
+        i_r <<= g_m;
+      }
+    }
+    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+      i_m = (a[1] >> (i * g_m)) & mask_m;
+      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
+      p_i[0] <<= g_m;
+      p_i[0] ^= (p_i[1] >> (64-g_m));
+      p_i[1] <<= g_m;
+      p_i[0] ^= gt->m_table[2 * i_m];
+      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
+      t_m += g_m;
+      if (t_m == g_r) {
+        p_i[1] ^= gt->r_table[i_r];
+        t_m = 0;
+        i_r = 0;
+      } else {
+        i_r <<= g_m;
+      }
+    }
+  
+    if (xor) {
+      c128[0] ^= p_i[0];
+      c128[1] ^= p_i[1];
+    } else {
+      c128[0] = p_i[0];
+      c128[1] = p_i[1];
+    }
+    a128 += 2;
+    c128 += 2;
+  }
+}
+
+/* a^-1 -> b */
+  void
+gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
+{
+  uint64_t e_i[2], e_im1[2], e_ip1[2];
+  uint64_t d_i, d_im1, d_ip1;
+  uint64_t y_i[2], y_im1[2], y_ip1[2];
+  uint64_t c_i[2];
+  uint64_t *b;
+  uint64_t one = 1;
+
+  /* This needs to return some sort of error (in b128?) */
+  if (a128[0] == 0 && a128[1] == 0) return;
+
+  b = (uint64_t *) b128;
+
+  e_im1[0] = 0;
+  e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i[0] = a128[0];
+  e_i[1] = a128[1];
+  d_im1 = 128;
+
+  //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at the MSB, looking for the first nonzero bit,
+  //so d_i should be 0 if this half of a is all 0s, otherwise it should be the position from the right of the first-from-left nonzero bit of this half of a.
+  //BUT if d_i is 0 at the end we won't know yet whether the rightmost bit of this half is 1 or not
+
+  for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ;
+
+  //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet
+
+  if (!((one << d_i) & e_i[0])) {
+
+    //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a.
+    // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0.
+
+    for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ;
+
+  } else {
+
+    //Allen: if a 1 was found in the more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a.
+
+    d_i += 64;
+  }
+  y_i[0] = 0;
+  y_i[1] = 1;
+  y_im1[0] = 0;
+  y_im1[1] = 0;
+
+  while (!(e_i[0] == 0 && e_i[1] == 1)) {
+
+    e_ip1[0] = e_im1[0];
+    e_ip1[1] = e_im1[1];
+    d_ip1 = d_im1;
+    c_i[0] = 0;
+    c_i[1] = 0;
+
+    while (d_ip1 >= d_i) {
+      if ((d_ip1 - d_i) >= 64) {
+        c_i[0] ^= (one << ((d_ip1 - d_i) - 64));
+        e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64));
+      } else {
+        c_i[1] ^= (one << (d_ip1 - d_i));
+        e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i));
+        if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i)));
+        e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i));
+      }
+      d_ip1--;
+      if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; }
+      while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--;
+      while (d_ip1 <  64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--;
+    }
+    gf->multiply.w128(gf, c_i, y_i, y_ip1);
+    y_ip1[0] ^= y_im1[0];
+    y_ip1[1] ^= y_im1[1];
+
+    y_im1[0] = y_i[0];
+    y_im1[1] = y_i[1];
+
+    y_i[0] = y_ip1[0];
+    y_i[1] = y_ip1[1];
+
+    e_im1[0] = e_i[0];
+    e_im1[1] = e_i[1];
+    d_im1 = d_i;
+    e_i[0] = e_ip1[0];
+    e_i[1] = e_ip1[1];
+    d_i = d_ip1;
+  }
+
+  b[0] = y_i[0];
+  b[1] = y_i[1];
+  return;
+}
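+
+/* Sketch of the invariant behind the extended Euclid loop above: each pair
+   (e, y) satisfies y * a = e (mod the field polynomial) -- initially (a, 1)
+   and (the modulus, 0), with the modulus' implicit x^128 term tracked via
+   d_im1 -- and every pass cancels the leading bits of the larger e against
+   the smaller one, so when e_i reaches 1, y_i is a^(-1). */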
+
+  void
+gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  uint64_t d[2];
+  gf->inverse.w128(gf, b128, d);
+  gf->multiply.w128(gf, a128, d, c128);
+  return;
+}
+
+  void
+gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
+{
+  uint64_t one128[2];
+  one128[0] = 0;
+  one128[1] = 1;
+  gf->divide.w128(gf, one128, a128, b128);
+  return;
+}
+
+
+static
+  void
+gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint64_t a0 = a[1];
+  uint64_t a1 = a[0];
+  uint64_t c0, c1, d, tmp;
+  uint64_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    a1inv = base_gf->inverse.w64(base_gf, a1);
+    c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w64(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w64(base_gf, a1);
+    a0inv = base_gf->inverse.w64(base_gf, a0);
+
+    d = base_gf->multiply.w64(base_gf, a1, a0inv);
+
+    tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly);
+    tmp = base_gf->inverse.w64(base_gf, tmp);
+
+    d = base_gf->multiply.w64(base_gf, d, tmp);
+
+    c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w64(base_gf, d, a1inv);
+  }
+  inv[0] = c1;
+  inv[1] = c0;
+}
+
+static
+  void
+gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint64_t b0 = b[1];
+  uint64_t b1 = b[0];
+  uint64_t a0 = a[1];
+  uint64_t a1 = a[0];
+  uint64_t a1b1;
+
+  a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
+
+  rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
+  rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ 
+    base_gf->multiply.w64(base_gf, a0, b1) ^ 
+    base_gf->multiply.w64(base_gf, a1b1, h->prim_poly);
+}
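+
+/* The arithmetic above is multiplication in GF((2^64)^2) with elements written
+   as a1*X + a0 and, as the code implies, X^2 reduced by X^2 = s*X + 1 where
+   s = h->prim_poly of the composite step:
+
+     (a1*X + a0)*(b1*X + b0) = (a1*b0 ^ a0*b1 ^ a1*b1*s)*X + (a0*b0 ^ a1*b1). */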
+
+static
+  void
+gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint64_t b0 = val[1];
+  uint64_t b1 = val[0];
+  uint64_t *s64, *d64;
+  uint64_t *top;
+  uint64_t a0, a1, a1b1;
+  gf_region_data rd;
+
+  if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  s64 = rd.s_start;
+  d64 = rd.d_start;
+  top = rd.d_top;
+
+  if (xor) {
+    while (d64 < top) {
+      a1 = s64[0];
+      a0 = s64[1];
+      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
+
+      d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
+      d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ 
+          base_gf->multiply.w64(base_gf, a0, b1) ^ 
+          base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
+      s64 += 2;
+      d64 += 2;
+    }
+  } else {
+    while (d64 < top) {
+      a1 = s64[0];
+      a0 = s64[1];
+      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
+
+      d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
+      d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ 
+          base_gf->multiply.w64(base_gf, a0, b1) ^ 
+          base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
+      s64 += 2;
+      d64 += 2;
+    }
+  }
+}
+
+static
+void
+gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  gf_val_64_t val0 = val[1];
+  gf_val_64_t val1 = val[0];
+  uint8_t *slow, *shigh;
+  uint8_t *dlow, *dhigh, *top;
+  int sub_reg_size;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64);
+  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
+
+  slow = (uint8_t *) rd.s_start;
+  dlow = (uint8_t *) rd.d_start;
+  top = (uint8_t*) rd.d_top;
+  sub_reg_size = (top - dlow)/2;
+  shigh = slow + sub_reg_size;
+  dhigh = dlow + sub_reg_size;
+
+  base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor);
+  base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1);
+  base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor);
+  base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
+  base_gf->multiply_region.w64(base_gf, shigh, dhigh,
+        base_gf->multiply.w64(base_gf, h->prim_poly, val1), sub_reg_size, 1);
+
+  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
+}
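+
+/* With the ALTMAP layout the two 64-bit halves of each word live in separate
+   sub-regions, so the composite product is formed with five base-field region
+   multiplies that together compute
+
+     dlow  = slow*val0 ^ shigh*val1
+     dhigh = slow*val1 ^ shigh*val0 ^ shigh*val1*prim_poly   */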
+
+
+  static
+int gf_w128_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt;   
+  } else {
+    gf->multiply_region.w128 = gf_w128_composite_multiply_region;
+  }
+
+  gf->multiply.w128 = gf_w128_composite_multiply;
+  gf->divide.w128 = gf_w128_divide_from_inverse;
+  gf->inverse.w128 = gf_w128_composite_inverse;
+
+  return 1;
+}
+
+static
+int gf_w128_cfm_init(gf_t *gf)
+{
+#if defined(INTEL_SSE4_PCLMUL)
+  gf->inverse.w128 = gf_w128_euclid;
+  gf->multiply.w128 = gf_w128_clm_multiply;
+  gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single;
+  return 1;
+#endif
+
+  return 0;
+}
+
+static
+int gf_w128_shift_init(gf_t *gf)
+{
+  gf->multiply.w128 = gf_w128_shift_multiply;
+  gf->inverse.w128 = gf_w128_euclid;
+  gf->multiply_region.w128 = gf_w128_multiply_region_from_single;
+  return 1;
+}
+
+  static
+int gf_w128_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  h = (gf_internal_t *) gf->scratch;
+
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    gf->multiply.w128 = gf_w128_bytwo_p_multiply;
+    /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/
+    /* John: the sse function is slower.*/
+  } else {
+    gf->multiply.w128 = gf_w128_bytwo_b_multiply;
+    /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply;
+Ben: This sse function is also slower. */
+  }
+  gf->inverse.w128 = gf_w128_euclid;
+  gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region;
+  return 1;
+}
+
+/*
+ * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64
+ * bits in all of these numbers.
+ */
+  static
+void gf_w128_group_r_init(gf_t *gf)
+{
+  int i, j;
+  int g_r;
+  uint64_t pp;
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  g_r = scratch->arg2;
+  pp = scratch->prim_poly;
+
+  gt->r_table[0] = 0;
+  for (i = 1; i < (1 << g_r); i++) {
+    gt->r_table[i] = 0;
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        gt->r_table[i] ^= (pp << j);
+      }
+    }
+  }
+  return;
+}
+
+#if 0 // defined(INTEL_SSE4)
+  static
+void gf_w128_group_r_sse_init(gf_t *gf)
+{
+  int i, j;
+  int g_r;
+  uint64_t pp;
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  __m128i zero = _mm_setzero_si128();
+  __m128i *table = (__m128i *)(gt->r_table);
+  g_r = scratch->arg2;
+  pp = scratch->prim_poly;
+  table[0] = zero;
+  for (i = 1; i < (1 << g_r); i++) {
+    table[i] = zero;
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
+      }
+    }
+  }
+  return;
+}
+#endif
+
+  static 
+int gf_w128_split_init(gf_t *gf)
+{
+  struct gf_w128_split_4_128_data *sd4;
+  struct gf_w128_split_8_128_data *sd8;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  gf->multiply.w128 = gf_w128_bytwo_p_multiply;
+#if defined(INTEL_SSE4_PCLMUL)
+  if (!(h->region_type & GF_REGION_NOSIMD)){
+    gf->multiply.w128 = gf_w128_clm_multiply;
+  }
+#endif
+
+  gf->inverse.w128 = gf_w128_euclid;
+
+  if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
+    sd8 = (struct gf_w128_split_8_128_data *) h->private;
+    sd8->last_value[0] = 0;
+    sd8->last_value[1] = 0;
+    gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region;
+  } else {
+    sd4 = (struct gf_w128_split_4_128_data *) h->private;
+    sd4->last_value[0] = 0;
+    sd4->last_value[1] = 0;
+    if((h->region_type & GF_REGION_ALTMAP))
+    {
+      #ifdef INTEL_SSE4
+        if(!(h->region_type & GF_REGION_NOSIMD))
+          gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
+        else
+          return 0;
+      #else
+        return 0;
+      #endif
+    }
+    else {
+      #ifdef INTEL_SSE4
+        if(!(h->region_type & GF_REGION_NOSIMD))
+          gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
+        else
+          gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
+      #else
+      gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
+      #endif
+    }
+  }
+  return 1;
+}
+
+
+static
+int gf_w128_group_init(gf_t *gf)
+{
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  int g_r, size_r;
+
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  g_r = scratch->arg2;
+  size_r = (1 << g_r);
+
+  gt->r_table = (gf_val_128_t)((uint8_t *)scratch->private + (2 * sizeof(uint64_t *)));
+  gt->m_table = gt->r_table + size_r;
+  gt->m_table[2] = 0;
+  gt->m_table[3] = 0;
+
+  gf->multiply.w128 = gf_w128_group_multiply;
+  gf->inverse.w128 = gf_w128_euclid;
+  gf->multiply_region.w128 = gf_w128_group_multiply_region;
+
+  gf_w128_group_r_init(gf);
+
+  return 1;
+}
+
+void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+  gf_val_128_t s;
+
+  s = (gf_val_128_t) start;
+  s += (index * 2); 
+  memcpy(rv, s, 16);
+}
+
+static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+  int i, blocks;
+  uint64_t *r64, tmp;
+  uint8_t *r8;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256);
+  r64 = (uint64_t *) start;
+  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
+      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
+    memcpy(rv, r64+(index*2), 16);
+    return;
+  }
+
+  index -= (((uint64_t *) rd.d_start) - r64)/2;
+  r64 = (uint64_t *) rd.d_start;
+
+  blocks = index/16;
+  r64 += (blocks*32);
+  index %= 16;
+  r8 = (uint8_t *) r64;
+  r8 += index;
+  rv[0] = 0;
+  rv[1] = 0;
+
+  for (i = 0; i < 8; i++) {
+    tmp = *r8;
+    rv[1] |= (tmp << (i*8));
+    r8 += 16;
+  }
+
+  for (i = 0; i < 8; i++) {
+    tmp = *r8;
+    rv[0] |= (tmp << (i*8));
+    r8 += 16;
+  }
+  return;
+}
+
+  static
+void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+  int sub_size;
+  gf_internal_t *h;
+  uint8_t *r8, *top;
+  uint64_t *r64;
+  gf_region_data rd;
+
+  h = (gf_internal_t *) gf->scratch;
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
+  r64 = (uint64_t *) start;
+  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
+      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
+    memcpy(rv, r64+(index*2), 16);
+    return;
+  }
+  index -= (((uint64_t *) rd.d_start) - r64)/2;
+  r8 = (uint8_t *) rd.d_start;
+  top = (uint8_t *) rd.d_top;
+  sub_size = (top-r8)/2;
+
+  rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index);
+  rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index);
+  
+  return;
+}
+
+int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  int size_m, size_r;
+  if (divide_type==GF_DIVIDE_MATRIX) return 0;
+
+  switch(mult_type)
+  {
+    case GF_MULT_CARRY_FREE:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_DEFAULT: 
+    case GF_MULT_SPLIT_TABLE:
+      if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64;
+      } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64;
+      }
+      return 0;
+      break;
+    case GF_MULT_GROUP:
+      /* JSP We've already error checked the arguments. */
+      size_m = (1 << arg1) * 2 * sizeof(uint64_t);
+      size_r = (1 << arg2) * 2 * sizeof(uint64_t);
+      /* 
+       * two pointers precede the table data in the structure
+       * because the tables are of dynamic size
+       */
+      return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *);
+      break;
+    case GF_MULT_COMPOSITE:
+      if (arg1 == 2) {
+        return sizeof(gf_internal_t) + 4;
+      } else {
+        return 0;
+      }
+      break;
+
+    default:
+      return 0;
+   }
+}
+
+int gf_w128_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  int no_default_flag = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+    } else {
+      h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
+    }
+    if (no_default_flag == 1) {
+      fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n");
+      return 0;
+    }
+  }
+
+  gf->multiply.w128 = NULL;
+  gf->divide.w128 = NULL;
+  gf->inverse.w128 = NULL;
+  gf->multiply_region.w128 = NULL;
+  switch(h->mult_type) {
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:      if (gf_w128_bytwo_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE:   if (gf_w128_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:        if (gf_w128_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:        if (gf_w128_group_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT: 
+    case GF_MULT_SPLIT_TABLE:  if (gf_w128_split_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:    if (gf_w128_composite_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+
+  /* Ben: This used to be h->region_type == GF_REGION_ALTMAP, but that failed since there
+     are multiple flags in h->region_type */
+  if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) {
+    gf->extract_word.w128 = gf_w128_split_extract_word;
+  } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) {
+    gf->extract_word.w128 = gf_w128_composite_extract_word;
+  } else {
+    gf->extract_word.w128 = gf_w128_extract_word;
+  }
+
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    gf->divide.w128 = gf_w128_divide_from_inverse;
+  } 
+
+  if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) {
+    gf->divide.w128 = gf_w128_divide_from_inverse;
+  }
+  if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) {
+    gf->inverse.w128 = gf_w128_inverse_from_divide;
+  }
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c
new file mode 100644
index 0000000..4e026b2
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c
@@ -0,0 +1,2452 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w16.c
+ *
+ * Routines for 16-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w16.h"
+
+#define AB2(ip, am1 ,am2, b, t1, t2) {\
+  t1 = (b << 1) & am1;\
+  t2 = b & am2; \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
+  b = (t1 ^ (t2 & ip));}
+
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, m2); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+
+#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf("  %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }
+
+#define GF_FIRST_BIT (1 << 15)
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
+
+static
+inline
+gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a)
+{
+  return gf->divide.w32(gf, 1, a);
+}
+
+static
+inline
+gf_val_32_t gf_w16_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  b = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b);
+}
+
+static
+void
+gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint16_t *s16;
+  uint16_t *d16;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+
+  if (xor) {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+      *d16 ^= gf->multiply.w32(gf, val, *s16);
+      d16++;
+      s16++;
+    } 
+  } else {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+      *d16 = gf->multiply.w32(gf, val, *s16);
+      d16++;
+      s16++;
+    } 
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint16_t *s16;
+  uint16_t *d16;
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+
+  if (xor) {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+
+      /* see gf_w16_clm_multiply() to see explanation of method */
+      
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+
+      *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d16++;
+      s16++;
+    } 
+  } else {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+      
+      /* see gf_w16_clm_multiply() to see explanation of method */
+      
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      
+      *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d16++;
+      s16++;
+    } 
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint16_t *s16;
+  uint16_t *d16;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+
+  if (xor) {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+      
+      /* see gf_w16_clm_multiply() to see explanation of method */
+      
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+
+      *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d16++;
+      s16++;
+    } 
+  } else {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+      
+      /* see gf_w16_clm_multiply() to see explanation of method */
+      
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      
+      *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d16++;
+      s16++;
+    } 
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint16_t *s16;
+  uint16_t *d16;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+
+  if (xor) {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+      
+      /* see gf_w16_clm_multiply() to see explanation of method */
+      
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+
+      *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d16++;
+      s16++;
+    } 
+  } else {
+    while (d16 < ((uint16_t *) rd.d_top)) {
+      
+      /* see gf_w16_clm_multiply() to see explanation of method */
+      
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+      result = _mm_xor_si128 (result, w);
+      
+      *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d16++;
+      s16++;
+    } 
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+static
+inline
+gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b)
+{
+  gf_val_32_t e_i, e_im1, e_ip1;
+  gf_val_32_t d_i, d_im1, d_ip1;
+  gf_val_32_t y_i, y_im1, y_ip1;
+  gf_val_32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = 16;
+  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+static
+gf_val_32_t gf_w16_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint16_t *r16, rv;
+
+  r16 = (uint16_t *) start;
+  rv = r16[index];
+  return rv;
+}
+
+static
+gf_val_32_t gf_w16_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int sub_size;
+  gf_internal_t *h;
+  uint8_t *r8, *top;
+  uint16_t a, b, *r16;
+  gf_region_data rd;
+
+  h = (gf_internal_t *) gf->scratch;
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  r16 = (uint16_t *) start;
+  if (r16 + index < (uint16_t *) rd.d_start) return r16[index];
+  if (r16 + index >= (uint16_t *) rd.d_top) return r16[index];
+  index -= (((uint16_t *) rd.d_start) - r16);
+  r8 = (uint8_t *) rd.d_start;
+  top = (uint8_t *) rd.d_top;
+  sub_size = (top-r8)/2;
+
+  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
+  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
+  return (a | (b << 8));
+}
+
+static
+gf_val_32_t gf_w16_split_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint16_t *r16, rv;
+  uint8_t *r8;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  r16 = (uint16_t *) start;
+  if (r16 + index < (uint16_t *) rd.d_start) return r16[index];
+  if (r16 + index >= (uint16_t *) rd.d_top) return r16[index];
+  index -= (((uint16_t *) rd.d_start) - r16);
+  r8 = (uint8_t *) rd.d_start;
+  r8 += ((index & 0xfffffff0)*2);
+  r8 += (index & 0xf);
+  rv = (*r8 << 8);
+  r8 += 16;
+  rv |= *r8;
+  return rv;
+}
+
+static
+inline
+gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
+{
+  return gf_bitmatrix_inverse(b, 16, ((gf_internal_t *) (gf->scratch))->prim_poly);
+}
+
+/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
+   include it for completeness.  It does have the feature that it requires no
+   extra memory.  
+ */
+
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+  b = _mm_insert_epi32 (a, b16, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only
+     have to do the reduction at most twice, because (w-2)/z == 2, where
+     z is the number of zeros after the leading 1.
+
+     _mm_clmulepi64_si128 is the carryless multiply operation. Here
+     _mm_srli_si128 shifts the result to the right by 2 bytes. This allows
+     us to multiply the prim_poly by the leading bits of the result. We
+     then xor the result of that operation back with the result.*/
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+
+#endif
+  return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+  b = _mm_insert_epi32 (a, b16, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+
+#endif
+  return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+  b = _mm_insert_epi32 (a, b16, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+
+#endif
+  return rv;
+}
+
+
+static
+inline
+ gf_val_32_t
+gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_val_32_t product, i, pp, a, b;
+  gf_internal_t *h;
+
+  a = a16;
+  b = b16;
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  product = 0;
+
+  for (i = 0; i < GF_FIELD_WIDTH; i++) { 
+    if (a & (1 << i)) product ^= (b << i);
+  }
+  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
+    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); 
+  }
+  return product;
+}
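+
+/* In other words: the first loop forms the 31-bit carry-free polynomial
+   product of a and b, and the second folds bits 30..16 back down by xoring in
+   prim_poly shifted to line up with each high bit that is set. */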
+
+static 
+int gf_w16_shift_init(gf_t *gf)
+{
+  gf->multiply.w32 = gf_w16_shift_multiply;
+  return 1;
+}
+
+static 
+int gf_w16_cfm_init(gf_t *gf)
+{
+#if defined(INTEL_SSE4_PCLMUL)
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /*Ben: Determining how many reductions to do */
+  
+  if ((0xfe00 & h->prim_poly) == 0) {
+    gf->multiply.w32 = gf_w16_clm_multiply_2;
+    gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2;
+  } else if((0xf000 & h->prim_poly) == 0) {
+    gf->multiply.w32 = gf_w16_clm_multiply_3;
+    gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3;
+  } else if ((0xe000 & h->prim_poly) == 0) {
+    gf->multiply.w32 = gf_w16_clm_multiply_4;
+    gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4;
+  } else {
+    return 0;
+  } 
+  return 1;
+#endif
+
+  return 0;
+}
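+
+/* Roughly: the masks above check how many zero bits immediately follow the
+   leading x^16 term of prim_poly.  The more such zeros, the more each
+   carry-less reduction pass shrinks the part of the product above bit 15,
+   so gf_w16_clm_multiply_2/3/4 need correspondingly fewer passes. */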
+
+/* KMG: GF_MULT_LOGTABLE: */
+
+static
+void
+gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint16_t *s16, *d16;
+  int lv;
+  struct gf_w16_logtable_data *ltd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+
+  lv = ltd->log_tbl[val];
+
+  if (xor) {
+    while (d16 < (uint16_t *) rd.d_top) {
+      *d16 ^= (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]);
+      d16++;
+      s16++;
+    }
+  } else {
+    while (d16 < (uint16_t *) rd.d_top) {
+      *d16 = (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]);
+      d16++;
+      s16++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w16_logtable_data *ltd;
+
+  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]];
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int log_sum = 0;
+  struct gf_w16_logtable_data *ltd;
+
+  if (a == 0 || b == 0) return 0;
+  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b];
+  return (ltd->d_antilog[log_sum]);
+}
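+
+/* This works without reducing the exponents modulo 2^16-1 because the antilog
+   table is laid out twice in a row (see gf_w16_log_init below) and d_antilog
+   points at its midpoint: log_tbl[a] + log_tbl[b] indexes the doubled table
+   directly, and log_tbl[a] - log_tbl[b], which may be negative, indexes from
+   the midpoint. */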
+
+static
+gf_val_32_t
+gf_w16_log_inverse(gf_t *gf, gf_val_32_t a)
+{
+  struct gf_w16_logtable_data *ltd;
+
+  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ltd->inv_tbl[a]);
+}
+
+static
+int gf_w16_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w16_logtable_data *ltd;
+  int i, b;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  ltd = h->private;
+  
+  for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++)
+    ltd->log_tbl[i] = 0;
+  ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE;
+
+  b = 1;
+  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
+      if (ltd->log_tbl[b] != 0) check = 1;
+      ltd->log_tbl[b] = i;
+      ltd->antilog_tbl[i] = b;
+      ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b;
+      b <<= 1;
+      if (b & GF_FIELD_SIZE) {
+          b = b ^ h->prim_poly;
+      }
+  }
+
+  /* If you can't construct the log table, there's a problem.  This code is used for
+     some other implementations (e.g. in SPLIT), so if the log table doesn't work in 
+     that instance, use CARRY_FREE / SHIFT instead. */
+
+  if (check) {
+    if (h->mult_type != GF_MULT_LOG_TABLE) {
+
+#if defined(INTEL_SSE4_PCLMUL)
+      return gf_w16_cfm_init(gf);
+#endif
+      return gf_w16_shift_init(gf);
+    } else {
+      _gf_errno = GF_E_LOGPOLY;
+      return 0;
+    }
+  }
+
+  ltd->inv_tbl[0] = 0;  /* Not really, but we need to fill it with something  */
+  ltd->inv_tbl[1] = 1;
+  for (i = 2; i < GF_FIELD_SIZE; i++) {
+    ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]];
+  }
+
+  gf->inverse.w32 = gf_w16_log_inverse;
+  gf->divide.w32 = gf_w16_log_divide;
+  gf->multiply.w32 = gf_w16_log_multiply;
+  gf->multiply_region.w32 = gf_w16_log_multiply_region;
+
+  return 1;
+}
+
+/* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions.
+*/
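+
+/* The 4,16 split works because multiplication by val distributes over XOR:
+   writing a 16-bit word as four nibbles a3:a2:a1:a0,
+
+     a * val = table[3][a3] ^ table[2][a2] ^ table[1][a1] ^ table[0][a0]
+
+   where table[i][j] = (j << (i*4)) * val.  The routines below differ only
+   in how the lookups are performed (scalar, SSSE3 shuffles, alternate
+   mapping). */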
+
+
+/* Ben: Does alternate-mapping multiplication using a split table in the
+   lazy method, without SSE instructions. */
+
+static 
+void
+gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t i, j, c, prod;
+  uint8_t *s8, *d8, *top;
+  uint16_t table[4][16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);    
+
+  /*Ben: Constructs lazy multiplication table*/
+
+  for (j = 0; j < 16; j++) {
+    for (i = 0; i < 4; i++) {
+      c = (j << (i*4));
+      table[i][j] = gf->multiply.w32(gf, c, val);
+    }
+  }
+
+  /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */
+  
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+  top = (uint8_t *) rd.d_top;
+
+
+  while (d8 < top) {
+    
+    /* Ben: Multiplies across 16 two-byte quantities using the alternate
+       mapping: high bits are on the left, low bits are on the right. */
+  
+    for (j=0;j<16;j++) {
+    
+      /*Ben: If the xor flag is set, the product should include what is in dest */
+      prod = (xor) ? ((uint16_t)(*d8)<<8) ^ *(d8+16) : 0;
+
+      /*Ben: xors all 4 table lookups into the product variable*/
+      
+      prod ^= ((table[0][*(s8+16)&0xf]) ^
+          (table[1][(*(s8+16)&0xf0)>>4]) ^
+          (table[2][*(s8)&0xf]) ^
+          (table[3][(*(s8)&0xf0)>>4]));
+
+      /*Ben: Stores product in the destination and moves on*/
+      
+      *d8 = (uint8_t)(prod >> 8);
+      *(d8+16) = (uint8_t)(prod & 0x00ff);
+      s8++;
+      d8++;
+    }
+    s8+=16;
+    d8+=16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+  void
+gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t i, j, a, c, prod;
+  uint16_t *s16, *d16, *top;
+  uint16_t table[4][16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);    
+
+  for (j = 0; j < 16; j++) {
+    for (i = 0; i < 4; i++) {
+      c = (j << (i*4));
+      table[i][j] = gf->multiply.w32(gf, c, val);
+    }
+  }
+
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+  top = (uint16_t *) rd.d_top;
+
+  while (d16 < top) {
+    a = *s16;
+    prod = (xor) ? *d16 : 0;
+    for (i = 0; i < 4; i++) {
+      prod ^= table[i][a&0xf];
+      a >>= 4;
+    }
+    *d16 = prod;
+    s16++;
+    d16++;
+  }
+}
+
+static
+void
+gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t j, k, v, a, prod, *s64, *d64, *top64;
+  gf_internal_t *h;
+  uint64_t htable[256], ltable[256];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+  
+  h = (gf_internal_t *) gf->scratch;
+
+  v = val;
+  ltable[0] = 0;
+  for (j = 1; j < 256; j <<= 1) {
+    for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]);
+    v = GF_MULTBY_TWO(v);
+  }
+  htable[0] = 0;
+  for (j = 1; j < 256; j <<= 1) {
+    for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]);
+    v = GF_MULTBY_TWO(v);
+  }
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top64 = (uint64_t *) rd.d_top;
+  
+/* Does Unrolling Matter?  -- Doesn't seem to.
+  while (d64 != top64) {
+    a = *s64;
+
+    prod = htable[a >> 56];
+    a <<= 8;
+    prod ^= ltable[a >> 56];
+    a <<= 8;
+    prod <<= 16;
+
+    prod ^= htable[a >> 56];
+    a <<= 8;
+    prod ^= ltable[a >> 56];
+    a <<= 8;
+    prod <<= 16;
+
+    prod ^= htable[a >> 56];
+    a <<= 8;
+    prod ^= ltable[a >> 56];
+    a <<= 8;
+    prod <<= 16;
+
+    prod ^= htable[a >> 56];
+    a <<= 8;
+    prod ^= ltable[a >> 56];
+    prod ^= ((xor) ? *d64 : 0); 
+    *d64 = prod;
+    s64++;
+    d64++;
+  }
+*/
+  
+  while (d64 != top64) {
+    a = *s64;
+
+    prod = 0;
+    for (j = 0; j < 4; j++) {
+      prod <<= 16;
+      prod ^= htable[a >> 56];
+      a <<= 8;
+      prod ^= ltable[a >> 56];
+      a <<= 8;
+    }
+
+    //JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better.
+   
+    prod ^= ((xor) ? *d64 : 0); 
+    *d64 = prod;
+    s64++;
+    d64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static void
+gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t c;
+  gf_internal_t *h;
+  struct gf_w16_lazytable_data *ltd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+  ltd = (struct gf_w16_lazytable_data *) h->private;
+
+  ltd->lazytable[0] = 0;
+
+  /*
+  a = val;
+  c = 1;
+  pp = h->prim_poly;
+
+  do {
+    ltd->lazytable[c] = a;
+    c <<= 1;
+    if (c & (1 << GF_FIELD_WIDTH)) c ^= pp;
+    a <<= 1;
+    if (a & (1 << GF_FIELD_WIDTH)) a ^= pp;
+  } while (c != 1);
+  */
+
+  for (c = 1; c < GF_FIELD_SIZE; c++) {
+    ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val);
+  }
+   
+  gf_two_byte_region_table_multiply(&rd, ltd->lazytable);
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSSE3
+  uint64_t i, j, *s64, *d64, *top64;
+  uint64_t c, prod;
+  uint8_t low[4][16];
+  uint8_t high[4][16];
+  gf_region_data rd;
+
+  __m128i  mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  for (j = 0; j < 16; j++) {
+    for (i = 0; i < 4; i++) {
+      c = (j << (i*4));
+      prod = gf->multiply.w32(gf, c, val);
+      low[i][j] = (prod & 0xff);
+      high[i][j] = (prod >> 8);
+    }
+  }
+
+  for (i = 0; i < 4; i++) {
+    tlow[i] = _mm_loadu_si128((__m128i *)low[i]);
+    thigh[i] = _mm_loadu_si128((__m128i *)high[i]);
+  }
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top64 = (uint64_t *) rd.d_top;
+
+  mask = _mm_set1_epi8 (0x0f);
+  lmask = _mm_set1_epi16 (0xff);
+
+  if (xor) {
+    while (d64 != top64) {
+      
+      ta = _mm_load_si128((__m128i *) s64);
+      tb = _mm_load_si128((__m128i *) (s64+2));
+
+      tta = _mm_srli_epi16(ta, 8);
+      ttb = _mm_srli_epi16(tb, 8);
+      tpl = _mm_and_si128(tb, lmask);
+      tph = _mm_and_si128(ta, lmask);
+
+      tb = _mm_packus_epi16(tpl, tph);
+      ta = _mm_packus_epi16(ttb, tta);
+
+      ti = _mm_and_si128 (mask, tb);
+      tph = _mm_shuffle_epi8 (thigh[0], ti);
+      tpl = _mm_shuffle_epi8 (tlow[0], ti);
+  
+      tb = _mm_srli_epi16(tb, 4);
+      ti = _mm_and_si128 (mask, tb);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);
+
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);
+  
+      ta = _mm_srli_epi16(ta, 4);
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);
+
+      ta = _mm_unpackhi_epi8(tpl, tph);
+      tb = _mm_unpacklo_epi8(tpl, tph);
+
+      tta = _mm_load_si128((__m128i *) d64);
+      ta = _mm_xor_si128(ta, tta);
+      ttb = _mm_load_si128((__m128i *) (d64+2));
+      tb = _mm_xor_si128(tb, ttb); 
+      _mm_store_si128 ((__m128i *)d64, ta);
+      _mm_store_si128 ((__m128i *)(d64+2), tb);
+
+      d64 += 4;
+      s64 += 4;
+      
+    }
+  } else {
+    while (d64 != top64) {
+      
+      ta = _mm_load_si128((__m128i *) s64);
+      tb = _mm_load_si128((__m128i *) (s64+2));
+
+      tta = _mm_srli_epi16(ta, 8);
+      ttb = _mm_srli_epi16(tb, 8);
+      tpl = _mm_and_si128(tb, lmask);
+      tph = _mm_and_si128(ta, lmask);
+
+      tb = _mm_packus_epi16(tpl, tph);
+      ta = _mm_packus_epi16(ttb, tta);
+
+      ti = _mm_and_si128 (mask, tb);
+      tph = _mm_shuffle_epi8 (thigh[0], ti);
+      tpl = _mm_shuffle_epi8 (tlow[0], ti);
+  
+      tb = _mm_srli_epi16(tb, 4);
+      ti = _mm_and_si128 (mask, tb);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);
+
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);
+  
+      ta = _mm_srli_epi16(ta, 4);
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);
+
+      ta = _mm_unpackhi_epi8(tpl, tph);
+      tb = _mm_unpacklo_epi8(tpl, tph);
+
+      _mm_store_si128 ((__m128i *)d64, ta);
+      _mm_store_si128 ((__m128i *)(d64+2), tb);
+
+      d64 += 4;
+      s64 += 4;
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
+void
+gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSSE3
+  uint64_t i, j, *s64, *d64, *top64;
+  uint64_t c, prod;
+  uint8_t low[4][16];
+  uint8_t high[4][16];
+  gf_region_data rd;
+  __m128i  mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4];
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  for (j = 0; j < 16; j++) {
+    for (i = 0; i < 4; i++) {
+      c = (j << (i*4));
+      prod = gf->multiply.w32(gf, c, val);
+      low[i][j] = (prod & 0xff);
+      high[i][j] = (prod >> 8);
+    }
+  }
+
+  for (i = 0; i < 4; i++) {
+    tlow[i] = _mm_loadu_si128((__m128i *)low[i]);
+    thigh[i] = _mm_loadu_si128((__m128i *)high[i]);
+  }
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top64 = (uint64_t *) rd.d_top;
+
+  mask = _mm_set1_epi8 (0x0f);
+
+  if (xor) {
+    while (d64 != top64) {
+
+      ta = _mm_load_si128((__m128i *) s64);
+      tb = _mm_load_si128((__m128i *) (s64+2));
+
+      ti = _mm_and_si128 (mask, tb);
+      tph = _mm_shuffle_epi8 (thigh[0], ti);
+      tpl = _mm_shuffle_epi8 (tlow[0], ti);
+  
+      tb = _mm_srli_epi16(tb, 4);
+      ti = _mm_and_si128 (mask, tb);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);
+
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);
+  
+      ta = _mm_srli_epi16(ta, 4);
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);
+
+      ta = _mm_load_si128((__m128i *) d64);
+      tph = _mm_xor_si128(tph, ta);
+      _mm_store_si128 ((__m128i *)d64, tph);
+      tb = _mm_load_si128((__m128i *) (d64+2));
+      tpl = _mm_xor_si128(tpl, tb);
+      _mm_store_si128 ((__m128i *)(d64+2), tpl);
+
+      d64 += 4;
+      s64 += 4;
+    }
+  } else {
+    while (d64 != top64) {
+
+      ta = _mm_load_si128((__m128i *) s64);
+      tb = _mm_load_si128((__m128i *) (s64+2));
+
+      ti = _mm_and_si128 (mask, tb);
+      tph = _mm_shuffle_epi8 (thigh[0], ti);
+      tpl = _mm_shuffle_epi8 (tlow[0], ti);
+  
+      tb = _mm_srli_epi16(tb, 4);
+      ti = _mm_and_si128 (mask, tb);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);
+
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);
+  
+      ta = _mm_srli_epi16(ta, 4);
+      ti = _mm_and_si128 (mask, ta);
+      tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
+      tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);
+
+      _mm_store_si128 ((__m128i *)d64, tph);
+      _mm_store_si128 ((__m128i *)(d64+2), tpl);
+
+      d64 += 4;
+      s64 += 4;
+      
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+
+#endif
+}
+
+uint32_t 
+gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t alow, blow;
+  struct gf_w16_split_8_8_data *d8;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  d8 = (struct gf_w16_split_8_8_data *) h->private;
+
+  alow = a & 0xff;
+  blow = b & 0xff;
+  a >>= 8;
+  b >>= 8;
+
+  return d8->tables[0][alow][blow] ^
+         d8->tables[1][alow][b] ^
+         d8->tables[1][a][blow] ^
+         d8->tables[2][a][b];
+}
+
+static 
+int gf_w16_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w16_split_8_8_data *d8;
+  int i, j, exp, issse3;
+  int isneon = 0;
+  uint32_t p, basep, tmp;
+
+  h = (gf_internal_t *) gf->scratch;
+
+#ifdef INTEL_SSSE3
+  issse3 = 1;
+#else
+  issse3 = 0;
+#endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
+
+  if (h->arg1 == 8 && h->arg2 == 8) {
+    d8 = (struct gf_w16_split_8_8_data *) h->private;
+    basep = 1;
+    for (exp = 0; exp < 3; exp++) {
+      for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
+      for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
+      d8->tables[exp][1][1] = basep;
+      for (i = 2; i < 256; i++) {
+        if (i&1) {
+          p = d8->tables[exp][i^1][1];
+          d8->tables[exp][i][1] = p ^ basep;
+        } else {
+          p = d8->tables[exp][i>>1][1];
+          d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
+        }
+      }
+      for (i = 1; i < 256; i++) {
+        p = d8->tables[exp][i][1];
+        for (j = 1; j < 256; j++) {
+          if (j&1) {
+            d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
+          } else {
+            tmp = d8->tables[exp][i][j>>1];
+            d8->tables[exp][i][j] = GF_MULTBY_TWO(tmp);
+          }
+        }
+      }
+      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
+    }
+    gf->multiply.w32 = gf_w16_split_8_8_multiply;
+    gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+    return 1;
+
+  }
+
+  /* We'll be using LOG for multiplication, unless the polynomial isn't
+     primitive.  In that case, gf_w16_log_init falls back to CARRY_FREE or
+     SHIFT. */
+
+  gf_w16_log_init(gf);
+
+  /* Defaults */
+
+  if (issse3) {
+    gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
+  } else if (isneon) {
+#ifdef ARM_NEON
+    gf_w16_neon_split_init(gf);
+#endif
+  } else {
+    gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+  }
+
+
+  if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
+    gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+
+  } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
+    if (issse3 || isneon) {
+      if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
+      else if(h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
+      else if(h->region_type & GF_REGION_ALTMAP && issse3)
+        gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
+    } else {
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+      else if(h->region_type & GF_REGION_ALTMAP)
+        gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
+    }
+  }
+
+  return 1;
+}
+
+static 
+int gf_w16_table_init(gf_t *gf)
+{
+  gf_w16_log_init(gf);
+
+  gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; 
+  return 1;
+}
+
+static
+void
+gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint16_t lv;
+  int i;
+  uint16_t *s16, *d16, *top16;
+  struct gf_w16_zero_logtable_data *ltd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private;
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+  top16 = (uint16_t *) rd.d_top;
+  bytes = top16 - d16;
+
+  lv = ltd->log_tbl[val];
+
+  if (xor) {
+    for (i = 0; i < bytes; i++) {
+      d16[i] ^= (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]);
+    }
+  } else {
+    for (i = 0; i < bytes; i++) {
+      d16[i] = (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]);
+    }
+  }
+
+  /* This isn't necessary. */
+  
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Here -- double-check Kevin */
+
+static
+inline
+gf_val_32_t
+gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w16_zero_logtable_data *ltd;
+
+  ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int log_sum = 0;
+  struct gf_w16_zero_logtable_data *ltd;
+
+  if (a == 0 || b == 0) return 0;
+  ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
+  return (ltd->antilog_tbl[log_sum]);
+}
+
+static
+gf_val_32_t
+gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a)
+{
+  struct gf_w16_zero_logtable_data *ltd;
+
+  ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ltd->inv_tbl[a]);
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, pmask, amask;
+  gf_internal_t *h;
+  
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  
+  prod = 0;
+  pmask = 0x8000;
+  amask = 0x8000;
+
+  while (amask != 0) {
+    if (prod & pmask) {
+      prod = ((prod << 1) ^ pp);
+    } else {
+      prod <<= 1;
+    }
+    if (a & amask) prod ^= b;
+    amask >>= 1;
+  }
+  return prod;
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, bmask;
+  gf_internal_t *h;
+  
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  bmask = 0x8000;
+
+  while (1) {
+    if (a & 1) prod ^= b;
+    a >>= 1;
+    if (a == 0) return prod;
+    if (b & bmask) {
+      b = ((b << 1) ^ pp);
+    } else {
+      b <<= 1;
+    }
+  }
+}
+
+static
+void 
+gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
+  gf_region_data rd;
+  struct gf_w16_bytwo_data *btd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x8000;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else { 
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x8000;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
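+/* One step of the BYTWO_p inner loop on a 128-bit register: SSE_AB2 first
+   multiplies the running product in every 16-bit lane by two, then
+   (v & 1) - 1 becomes an all-ones lane mask exactly when the low bit of v
+   is zero.  v starts out as vrev, the bit-reversed complement of val (see
+   gf_w16_bytwo_p_sse_multiply_region), so ta is XORed into prod precisely
+   for the set bits of val, most significant bit first, with no branches. */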
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi16(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
+
+#ifdef INTEL_SSE2
+static
+void 
+gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint32_t vrev;
+  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
+  struct gf_w16_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  vrev = 0;
+  for (i = 0; i < 16; i++) {
+    vrev <<= 1;
+    if (!(val & (1 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
+  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
+  m2 = _mm_set1_epi16((btd->mask2)&0xffff);
+  one = _mm_set1_epi16(1);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi16(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
+  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
+  m2 = _mm_set1_epi16((btd->mask2)&0xffff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
+  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
+  m2 = _mm_set1_epi16((btd->mask2)&0xffff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    vb = _mm_load_si128 ((__m128i *)(d8));
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+static
+void 
+gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int itb;
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_w16_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  if (val == 2) {
+    if (xor) {
+      gf_w16_bytwo_b_sse_region_2_xor(&rd, btd);
+    } else {
+      gf_w16_bytwo_b_sse_region_2_noxor(&rd, btd);
+    }
+    gf_do_final_region_alignment(&rd);
+    return;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
+  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
+  m2 = _mm_set1_epi16((btd->mask2)&0xffff);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
+    itb = val;
+    while (1) {
+      if (itb & 1) vb = _mm_xor_si128(vb, va);
+      itb >>= 1;
+      if (itb == 0) break;
+      SSE_AB2(pp, m1, m2, va, t1, t2);
+    }
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+static
+void 
+gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
+  struct gf_w16_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  switch (val) {
+  case 2:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 3:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 4:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 5:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  default:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = *d64 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = 0 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+int gf_w16_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  uint64_t ip, m1, m2;
+  struct gf_w16_bytwo_data *btd;
+
+  h = (gf_internal_t *) gf->scratch;
+  btd = (struct gf_w16_bytwo_data *) (h->private);
+  ip = h->prim_poly & 0xffff;
+  m1 = 0xfffe;
+  m2 = 0x8000;
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+
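+  /* Replicate the 16-bit polynomial and masks into every 16-bit lane of a
+     64-bit word, so the AB2 steps in the region routines can advance four
+     packed field elements at a time: mask1 (0xfffe per lane) keeps a left
+     shift from leaking bits across lanes, and mask2 (0x8000 per lane)
+     isolates each lane's top bit for the conditional reduction. */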
+  while (ip != 0) {
+    btd->prim_poly |= ip;
+    btd->mask1 |= m1;
+    btd->mask2 |= m2;
+    ip <<= GF_FIELD_WIDTH;
+    m1 <<= GF_FIELD_WIDTH;
+    m2 <<= GF_FIELD_WIDTH;
+  }
+
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    gf->multiply.w32 = gf_w16_bytwo_p_multiply;
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
+    #else
+      gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  } else {
+    gf->multiply.w32 = gf_w16_bytwo_b_multiply;
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
+    #else
+      gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  }
+
+  return 1;
+}
+
+static
+int gf_w16_log_zero_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w16_zero_logtable_data *ltd;
+  int i, b;
+
+  h = (gf_internal_t *) gf->scratch;
+  ltd = h->private;
+
+  ltd->log_tbl[0] = (-GF_MULT_GROUP_SIZE) + 1;
+
+  bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl));
+
+  ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_FIELD_SIZE * 2]);
+
+  b = 1;
+  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
+      ltd->log_tbl[b] = (uint16_t)i;
+      ltd->antilog_tbl[i] = (uint16_t)b;
+      ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = (uint16_t)b;
+      b <<= 1;
+      if (b & GF_FIELD_SIZE) {
+          b = b ^ h->prim_poly;
+      }
+  }
+  ltd->inv_tbl[0] = 0;  /* Not really, but we need to fill it with something  */
+  ltd->inv_tbl[1] = 1;
+  for (i = 2; i < GF_FIELD_SIZE; i++) {
+    ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]];
+  }
+
+  gf->inverse.w32 = gf_w16_log_zero_inverse;
+  gf->divide.w32 = gf_w16_log_zero_divide;
+  gf->multiply.w32 = gf_w16_log_zero_multiply;
+  gf->multiply_region.w32 = gf_w16_log_zero_multiply_region;
+  return 1;
+}
+
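+/* Composite field GF((2^8)^2): a 16-bit element is a1*x + a0 with a0, a1
+   in GF(2^8), and products are reduced modulo x^2 + s*x + 1 with
+   s = h->prim_poly (see the division comment further below).  Multiplying
+   out gives
+
+     (a1*x + a0)*(b1*x + b0) = (a1*b0 ^ a0*b1 ^ s*a1*b1)*x + (a0*b0 ^ a1*b1)
+
+   which is what the recursive and inline routines compute. */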
+static
+gf_val_32_t
+gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t b0 = b & 0x00ff;
+  uint8_t b1 = (b & 0xff00) >> 8;
+  uint8_t a0 = a & 0x00ff;
+  uint8_t a1 = (a & 0xff00) >> 8;
+  uint8_t a1b1;
+  uint16_t rv;
+
+  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+  rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
+  return rv;
+}
+
+static
+gf_val_32_t
+gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint8_t b0 = b & 0x00ff;
+  uint8_t b1 = (b & 0xff00) >> 8;
+  uint8_t a0 = a & 0x00ff;
+  uint8_t a1 = (a & 0xff00) >> 8;
+  uint8_t a1b1, *mt;
+  uint16_t rv;
+  struct gf_w16_composite_data *cd;
+
+  cd = (struct gf_w16_composite_data *) h->private;
+  mt = cd->mult_table;
+
+  a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+
+  rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
+  return rv;
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report)
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
+ *
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ *
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ *
+ * a / b = a * c
+ */
+
+static
+gf_val_32_t
+gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t a0 = a & 0x00ff;
+  uint8_t a1 = (a & 0xff00) >> 8;
+  uint8_t c0, c1, d, tmp;
+  uint16_t c;
+  uint8_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w32(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    a0inv = base_gf->inverse.w32(base_gf, a0);
+
+    d = base_gf->multiply.w32(base_gf, a1, a0inv);
+
+    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
+    tmp = base_gf->inverse.w32(base_gf, tmp);
+
+    d = base_gf->multiply.w32(base_gf, d, tmp);
+
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
+  }
+
+  c = c0 | (c1 << 8);
+
+  return c;
+}
+
+static
+void
+gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t b0 = val & 0x00ff;
+  uint8_t b1 = (val & 0xff00) >> 8;
+  uint16_t *s16, *d16, *top;
+  uint8_t a0, a1, a1b1, *mt;
+  gf_region_data rd;
+  struct gf_w16_composite_data *cd;
+
+  cd = (struct gf_w16_composite_data *) h->private;
+  mt = cd->mult_table;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+
+  s16 = rd.s_start;
+  d16 = rd.d_start;
+  top = rd.d_top;
+
+  if (mt == NULL) {
+    if (xor) {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+  
+        (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                  ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
+                    base_gf->multiply.w32(base_gf, a0, b1) ^ 
+                    base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    } else {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+  
+        (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                  ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
+                    base_gf->multiply.w32(base_gf, a0, b1) ^ 
+                    base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    }
+  } else {
+    if (xor) {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+  
+        (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+                  ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    } else {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+  
+        (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+                  ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    }
+  }
+}
+
+static
+void
+gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t val0 = val & 0x00ff;
+  uint8_t val1 = (val & 0xff00) >> 8;
+  gf_region_data rd;
+  int sub_reg_size;
+  uint8_t *slow, *shigh;
+  uint8_t *dlow, *dhigh, *top;
+
+  /* JSP: I want the two pointers aligned wrt each other on 16 byte 
+     boundaries.  So I'm going to make sure that the area on 
+     which the two operate is a multiple of 32. Of course, that 
+     junks up the mapping, but so be it -- that's why we have extract_word.... */
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  slow = (uint8_t *) rd.s_start;
+  dlow = (uint8_t *) rd.d_start;
+  top = (uint8_t *)  rd.d_top;
+  sub_reg_size = (top - dlow)/2;
+  shigh = slow + sub_reg_size;
+  dhigh = dlow + sub_reg_size;
+
+  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+int gf_w16_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_composite_data *cd;
+
+  if (h->base_gf == NULL) return 0;
+
+  cd = (struct gf_w16_composite_data *) h->private;
+  cd->mult_table = gf_w8_get_mult_table(h->base_gf);
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt;
+  } else {
+    gf->multiply_region.w32 = gf_w16_composite_multiply_region;
+  }
+
+  if (cd->mult_table == NULL) {
+    gf->multiply.w32 = gf_w16_composite_multiply_recursive;
+  } else {
+    gf->multiply.w32 = gf_w16_composite_multiply_inline;
+  }
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = gf_w16_composite_inverse;
+
+  return 1;
+}
+
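+/* GROUP 4,4: multiply four bits of a at a time.  shift[k] holds the field
+   product k*val for every 4-bit k.  Each step multiplies the running
+   product by x^4 (the r << 4), folds the four bits that would overflow
+   past bit 15 back in through the reduce[] table built in
+   gf_w16_group_init, and adds the next nibble's contribution from
+   shift[]. */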
+static
+void
+gf_w16_group_4_set_shift_tables(uint16_t *shift, uint16_t val, gf_internal_t *h)
+{
+  int i, j;
+
+  shift[0] = 0;
+  for (i = 0; i < 16; i += 2) {
+    j = (shift[i>>1] << 1);
+    if (j & (1 << 16)) j ^= h->prim_poly;
+    shift[i] = j;
+    shift[i^1] = j^val;
+  }
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint16_t p, l, ind, r, a16;
+
+  struct gf_w16_group_4_4_data *d44;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  d44 = (struct gf_w16_group_4_4_data *) h->private;
+  gf_w16_group_4_set_shift_tables(d44->shift, b, h);
+
+  a16 = a;
+  ind = a16 >> 12;
+  a16 <<= 4;
+  p = d44->shift[ind];
+  r = p & 0xfff;
+  l = p >> 12;
+  ind = a16 >> 12;
+  a16 <<= 4;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
+  r = p & 0xfff;
+  l = p >> 12;
+  ind = a16 >> 12;
+  a16 <<= 4;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
+  r = p & 0xfff;
+  l = p >> 12;
+  ind = a16 >> 12;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
+  return p;
+}
+
+static
+void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint16_t p, l, ind, r, a16, p16;
+  struct gf_w16_group_4_4_data *d44;
+  gf_region_data rd;
+  uint16_t *s16, *d16, *top;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  d44 = (struct gf_w16_group_4_4_data *) h->private;
+  gf_w16_group_4_set_shift_tables(d44->shift, val, h);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  s16 = (uint16_t *) rd.s_start;
+  d16 = (uint16_t *) rd.d_start;
+  top = (uint16_t *) rd.d_top;
+
+  while (d16 < top) {
+    a16 = *s16;
+    p16 = (xor) ? *d16 : 0;
+    ind = a16 >> 12;
+    a16 <<= 4;
+    p = d44->shift[ind];
+    r = p & 0xfff;
+    l = p >> 12;
+    ind = a16 >> 12;
+    a16 <<= 4;
+    p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
+    r = p & 0xfff;
+    l = p >> 12;
+    ind = a16 >> 12;
+    a16 <<= 4;
+    p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
+    r = p & 0xfff;
+    l = p >> 12;
+    ind = a16 >> 12;
+    p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
+    p ^= p16;
+    *d16 = p;
+    d16++;
+    s16++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+int gf_w16_group_init(gf_t *gf)
+{
+  int i, j, p;
+  struct gf_w16_group_4_4_data *d44;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  d44 = (struct gf_w16_group_4_4_data *) h->private;
+  d44->reduce[0] = 0;
+  for (i = 0; i < 16; i++) {
+    p = 0;
+    for (j = 0; j < 4; j++) {
+      if (i & (1 << j)) p ^= (h->prim_poly << j);
+    }
+    d44->reduce[p>>16] = (p&0xffff);
+  }
+
+  gf->multiply.w32 = gf_w16_group_4_4_multiply;
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = NULL;
+  gf->multiply_region.w32 = gf_w16_group_4_4_region_multiply;
+
+  return 1;
+}
+
+int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  switch(mult_type)
+  {
+    case GF_MULT_TABLE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64;
+      break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data);
+      break;
+    case GF_MULT_LOG_ZERO:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64;
+      break;
+    case GF_MULT_LOG_TABLE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
+      break;
+    case GF_MULT_DEFAULT:
+    case GF_MULT_SPLIT_TABLE: 
+      if (arg1 == 8 && arg2 == 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64;
+      } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
+      } else if (mult_type == GF_MULT_DEFAULT || 
+                 (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
+      }
+      return 0;
+      break;
+    case GF_MULT_GROUP:     
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64;
+      break;
+    case GF_MULT_CARRY_FREE:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_COMPOSITE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64;
+      break;
+
+    default:
+      return 0;
+   }
+   return 0;
+}
+
+int gf_w16_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0;
+    } else { 
+
+     /* Allen: use the following primitive polynomial to make 
+               carryless multiply work more efficiently for GF(2^16).
+
+        h->prim_poly = 0x1002d;
+
+        The following is the traditional primitive polynomial for GF(2^16) */
+
+      h->prim_poly = 0x1100b;
+    } 
+  }
+
+  if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16);
+
+  gf->multiply.w32 = NULL;
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = NULL;
+  gf->multiply_region.w32 = NULL;
+
+  switch(h->mult_type) {
+    case GF_MULT_LOG_ZERO:    if (gf_w16_log_zero_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_TABLE:   if (gf_w16_log_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT: 
+    case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break;
+    case GF_MULT_TABLE:       if (gf_w16_table_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE:  if (gf_w16_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:       if (gf_w16_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:   if (gf_w16_composite_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p: 
+    case GF_MULT_BYTWO_b:     if (gf_w16_bytwo_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:       if (gf_w16_group_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    gf->divide.w32 = gf_w16_divide_from_inverse;
+    gf->inverse.w32 = gf_w16_euclid;
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    gf->divide.w32 = gf_w16_divide_from_inverse;
+    gf->inverse.w32 = gf_w16_matrix;
+  }
+
+  if (gf->divide.w32 == NULL) {
+    gf->divide.w32 = gf_w16_divide_from_inverse;
+    if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid;
+  }
+
+  if (gf->inverse.w32 == NULL)  gf->inverse.w32 = gf_w16_inverse_from_divide;
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      gf->extract_word.w32 = gf_w16_composite_extract_word;
+    } else {
+      gf->extract_word.w32 = gf_w16_split_extract_word;
+    }
+  } else if (h->region_type == GF_REGION_CAUCHY) {
+    gf->multiply_region.w32 = gf_wgen_cauchy_region;
+    gf->extract_word.w32 = gf_wgen_extract_word;
+  } else {
+    gf->extract_word.w32 = gf_w16_extract_word;
+  }
+  if (gf->multiply_region.w32 == NULL) {
+    gf->multiply_region.w32 = gf_w16_multiply_region_from_single;
+  }
+  return 1;
+}
+
+/* Inline setup functions */
+
+uint16_t *gf_w16_get_log_table(gf_t *gf)
+{
+  struct gf_w16_logtable_data *ltd;
+
+  if (gf->multiply.w32 == gf_w16_log_multiply) {
+    ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+    return (uint16_t *) ltd->log_tbl;
+  }
+  return NULL;
+}
+
+uint16_t *gf_w16_get_mult_alog_table(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w16_logtable_data *ltd;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (gf->multiply.w32 == gf_w16_log_multiply) {
+    ltd = (struct gf_w16_logtable_data *) h->private;
+    return (uint16_t *) ltd->antilog_tbl;
+  }
+  return NULL;
+}
+
+uint16_t *gf_w16_get_div_alog_table(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w16_logtable_data *ltd;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (gf->multiply.w32 == gf_w16_log_multiply) {
+    ltd = (struct gf_w16_logtable_data *) h->private;
+    return (uint16_t *) ltd->d_antilog;
+  }
+  return NULL;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c
new file mode 100644
index 0000000..854a6e4
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c
@@ -0,0 +1,2823 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w32.c
+ *
+ * Routines for 32-bit Galois fields
+ */
+
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w32.h"
+
+#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
+
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
+
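+/* AB2 multiplies every field element packed into b by two (i.e. by x)
+   without branching: t1 is the left shift with cross-element carry bits
+   cleared by am1, and t2 isolates each element's top bit, which
+   (t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1)) stretches into a full-width mask
+   selecting where the primitive polynomial ip gets XORed in.  SSE_AB2 is
+   the same computation on a 128-bit register. */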
+#define AB2(ip, am1 ,am2, b, t1, t2) {\
+  t1 = (b << 1) & am1;\
+  t2 = b & am2; \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
+  b = (t1 ^ (t2 & ip));}
+
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, m2); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+
+static
+inline
+uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a)
+{
+  return gf->divide.w32(gf, 1, a);
+}
+
+static
+inline
+uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
+{
+  b = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b);
+}
+
+static
+void
+gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+   
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+ 
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      d32[i] ^= gf->multiply.w32(gf, val, s32[i]);
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      d32[i] = gf->multiply.w32(gf, val, s32[i]);
+    } 
+  }
+}
+
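+/* The _2, _3 and _4 variants below are identical except for the number of
+   CLMUL folding rounds used to reduce the 64-bit carry-free product back
+   to 32 bits; the CARRY_FREE init code selects among them based on the
+   high coefficients of the primitive polynomial, just as the w16 code
+   above does for its CLM routines. */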
+#if defined(INTEL_SSE4_PCLMUL)
+
+static 
+void
+gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+  
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+   
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+ 
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL) 
+
+static 
+void
+gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+  
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+ 
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static 
+void
+gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+  
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+ 
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
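+/* Extended Euclidean algorithm over GF(2)[x]: e_i holds the remainders
+   (with degrees d_i), starting from the primitive polynomial and b, and
+   y_i holds the matching Bezout coefficients.  When the remainder reaches
+   1, y_i is b's multiplicative inverse; if a remainder hits 0 first, the
+   routine returns 0. */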
+static
+inline
+uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
+{
+  uint32_t e_i, e_im1, e_ip1;
+  uint32_t d_i, d_im1, d_ip1;
+  uint32_t y_i, y_im1, y_ip1;
+  uint32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; 
+  e_i = b;
+  d_im1 = 32;
+  for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      d_ip1--;
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+static
+gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint32_t *r32, rv;
+
+  r32 = (uint32_t *) start;
+  rv = r32[index];
+  return rv;
+}
+
+static
+gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int sub_size;
+  gf_internal_t *h;
+  uint8_t *r8, *top;
+  uint32_t a, b, *r32;
+  gf_region_data rd;
+
+  h = (gf_internal_t *) gf->scratch;
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  r32 = (uint32_t *) start;
+  if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
+  if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
+  index -= (((uint32_t *) rd.d_start) - r32);
+  r8 = (uint8_t *) rd.d_start;
+  top = (uint8_t *) rd.d_top;
+  sub_size = (top-r8)/2;
+
+  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
+  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
+  return (a | (b << 16));
+}
+
+static
+gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int i;
+  uint32_t *r32, rv;
+  uint8_t *r8;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
+  r32 = (uint32_t *) start;
+  if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
+  if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
+  index -= (((uint32_t *) rd.d_start) - r32);
+  r8 = (uint8_t *) rd.d_start;
+  r8 += ((index & 0xfffffff0)*4);
+  r8 += (index & 0xf);
+  r8 += 48;
+  rv = 0;
+  for (i = 0; i < 4; i++) {
+    rv <<= 8;
+    rv |= *r8;
+    r8 -= 16;
+  }
+  return rv;
+}
+
+
+static
+inline
+uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
+{
+  return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly);
+}
+
+/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
+   include it for completeness.  It does have the feature that it requires no
+   extra memory.  
+*/
+
+static
+inline
+gf_val_32_t
+gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         w;
+  __m128i         g, q;
+  gf_internal_t * h = gf->scratch;
+  uint64_t        g_star, q_plus;
+
+  q_plus = *(uint64_t *) h->private;
+  g_star = *((uint64_t *) h->private + 1);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+  g = _mm_insert_epi64 (a, g_star, 0);
+  q = _mm_insert_epi64 (a, q_plus, 0);
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+  w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
+  w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+  return rv;
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+static 
+void
+gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+  
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         w;
+  __m128i         g, q;
+  gf_internal_t * h = gf->scratch;
+  uint64_t        g_star, q_plus;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  q_plus = *(uint64_t *) h->private;
+  g_star = *((uint64_t *) h->private + 1);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  g = _mm_insert_epi64 (a, g_star, 0);
+  q = _mm_insert_epi64 (a, q_plus, 0);
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+ 
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
+      w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
+      w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
+
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+  
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+  
+  /* Do the initial multiply */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben: Do the prim_poly reduction twice. We are guaranteed to need at
+     most two reductions, because (w-2)/z == 2, where z is the number of
+     zeros after the leading 1 of the primitive polynomial.
+
+     _mm_clmulepi64_si128 is the carry-less multiply operation. Here
+     _mm_srli_si128 shifts the result to the right by 4 bytes, which lets
+     us multiply the prim_poly by the leading bits of the result. We then
+     xor the result of that operation back into the result. */
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+  return rv;
+}
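+/* Editor's sketch of the degree argument above (assuming deg(q) <= 16 for
+   this _2 variant, i.e. (0xfffe0000 & prim_poly) == 0, with p(x) = x^32 + q):
+
+     a and b have degree <= 31, so result = a*b has degree <= 62.
+     One pass replaces the high part H = result >> 32 (degree <= 30) by H*q,
+     since H*x^32 cancels against the high bits of result; deg(H*q) <= 46,
+     so the new high part has degree <= 14.
+     A second pass leaves degree <= 14 + 16 = 30 < 32, so nothing remains
+     above bit 31 and the low 32 bits are (a*b) mod p(x). */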
+
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+  return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+  return rv;
+}
+
+
+static
+inline
+uint32_t
+gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
+{
+  uint64_t product, i, pp, a, b, one;
+  gf_internal_t *h;
+
+  a = a32;
+  b = b32;
+  h = (gf_internal_t *) gf->scratch;
+  one = 1;
+  pp = h->prim_poly | (one << 32);
+
+  product = 0;
+
+  for (i = 0; i < GF_FIELD_WIDTH; i++) { 
+    if (a & (one << i)) product ^= (b << i);
+  }
+  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
+    if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); 
+  }
+  return product;
+}
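+/* Editor's worked example for the shift/reduce loop above (values chosen by
+   the editor, not from upstream): with a32 = 3 (x + 1) and b32 = 7
+   (x^2 + x + 1), the first loop produces the carry-free product
+   (b << 0) ^ (b << 1) = 0x7 ^ 0xE = 0x9, i.e. x^3 + 1.  The product already
+   has degree < 32, so the second loop never fires and the routine returns 9.
+   For operands near 32 bits the product can reach degree 62, and the second
+   loop folds each set bit i >= 32 back down by xoring in pp << (i - 32),
+   which is plain long division by the primitive polynomial. */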
+
+  static 
+int gf_w32_cfmgk_init(gf_t *gf)
+{
+  gf->inverse.w32 = gf_w32_euclid;
+  gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
+  
+#if defined(INTEL_SSE4_PCLMUL)
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  gf->multiply.w32 = gf_w32_cfmgk_multiply;
+  gf->multiply_region.w32 = gf_w32_cfmgk_multiply_region_from_single;
+
+  uint64_t *q_plus = (uint64_t *) h->private;
+  uint64_t *g_star = (uint64_t *) h->private + 1;
+
+  uint64_t tmp = h->prim_poly << 32;
+  *q_plus = 1ULL << 32;
+
+  int i;
+  for(i = 63; i >= 32; i--)
+    if((1ULL << i) & tmp)
+    {
+      *q_plus |= 1ULL << (i-32);
+      tmp ^= h->prim_poly << (i-32);
+    }
+
+  *g_star = h->prim_poly & ((1ULL << 32) - 1);
+
+  return 1;
+#endif
+
+  return 0;
+}
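+/* Editor's note, a sketch of what the loop above computes under the usual
+   reading of the carry-free (Barrett-style) reduction: writing the field
+   polynomial as p(x) = x^32 + g*(x), the loop performs a carry-free long
+   division of x^64 by p(x), leaving
+
+       q_plus = floor(x^64 / p(x))   (a 33-bit quotient)
+       g_star = p(x) mod x^32        (the low 32 bits of p)
+
+   The region routine then reduces a 64-bit carry-free product r with two
+   more clmuls:  t = (r >> 32) * q_plus;  r ^= (t >> 32) * g_star;  which
+   leaves (a*b) mod p(x) in the low 32 bits of r. */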
+
+  static 
+int gf_w32_cfm_init(gf_t *gf)
+{
+  gf->inverse.w32 = gf_w32_euclid;
+  gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
+  
+  /* Ben: We also check to see if the prim poly will work for pclmul,
+     and check to see how many reduction steps it will take. */
+
+#if defined(INTEL_SSE4_PCLMUL)
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if ((0xfffe0000 & h->prim_poly) == 0){ 
+    gf->multiply.w32 = gf_w32_clm_multiply_2;
+    gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2;
+  }else if ((0xffc00000 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w32_clm_multiply_3;
+    gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3;
+  }else if ((0xfe000000 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w32_clm_multiply_4;
+    gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4;
+  } else {
+    return 0;
+  }
+  return 1;
+  #endif
+
+  return 0;
+}
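+/* Editor's worked example for the three mask checks above (the polynomial
+   value is quoted from the editor's recollection of the library default, so
+   treat it as illustrative): for the common w=32 polynomial 0x400007
+   (x^32 + x^22 + x^2 + x + 1), bit 22 is set, so both 0xfffe0000 and
+   0xffc00000 intersect it and the two- and three-pass variants are rejected;
+   0xfe000000 (bits 25..31) does not intersect it, so clm_multiply_4 and
+   clm_multiply_region_from_single_4 are selected, i.e. four reduction
+   passes per product. */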
+
+  static 
+int gf_w32_shift_init(gf_t *gf)
+{
+  gf->inverse.w32 = gf_w32_euclid;
+  gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
+  gf->multiply.w32 = gf_w32_shift_multiply;
+  return 1;
+}
+
+static
+  void
+gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
+{
+  uint32_t i;
+  uint32_t j;
+
+  shift[0] = 0;
+
+  for (i = 1; i < ((uint32_t)1 << h->arg1); i <<= 1) {
+    for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
+    if (val & GF_FIRST_BIT) {
+      val <<= 1;
+      val ^= h->prim_poly;
+    } else {
+      val <<= 1;
+    }
+  }
+}
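+/* Editor's note on the table built above: after the outer loop, shift[n]
+   equals the field product n * val for every n < (1 << arg1).  Each pass
+   doubles val (reducing by prim_poly whenever the top bit is set), so at the
+   start of pass k val holds val*x^k, and shift[i|j] = shift[j] ^ val*x^k
+   fills every index whose highest set bit is bit k.  The group multiply
+   routines below then consume the multiplicand arg1 bits at a time against
+   this table. */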
+
+  static
+void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int leftover, rs;
+  uint32_t p, l, ind, a32;
+  int bits_left;
+  int g_s;
+  gf_region_data rd;
+  uint32_t *s32, *d32, *top;
+  struct gf_w32_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gd = (struct gf_w32_group_data *) h->private;
+  g_s = h->arg1;
+  gf_w32_group_set_shift_tables(gd->shift, val, h);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  leftover = 32 % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  while (d32 < top) {
+    rs = 32 - leftover;
+    a32 = *s32;
+    ind = a32 >> rs;
+    a32 <<= leftover;
+    p = gd->shift[ind];
+
+    bits_left = rs;
+    rs = 32 - g_s;
+
+    while (bits_left > 0) {
+      bits_left -= g_s;
+      ind = a32 >> rs;
+      a32 <<= g_s;
+      l = p >> rs;
+      p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
+    }
+    if (xor) p ^= *d32;
+    *d32 = p;
+    d32++;
+    s32++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+  static
+void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint32_t *s32, *d32, *top;
+  int i;
+  int leftover;
+  uint64_t p, l, r;
+  uint32_t a32, ind;
+  int g_s, g_r;
+  struct gf_w32_group_data *gd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+  g_r = h->arg2;
+  gd = (struct gf_w32_group_data *) h->private;
+  gf_w32_group_set_shift_tables(gd->shift, val, h);
+
+  leftover = GF_FIELD_WIDTH % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  while (d32 < top) {
+    a32 = *s32;
+    ind = a32 >> (GF_FIELD_WIDTH - leftover);
+    p = gd->shift[ind];
+    p <<= g_s;
+    a32 <<= leftover;
+  
+    i = (GF_FIELD_WIDTH - leftover);
+    while (i > g_s) {
+      ind = a32 >> (GF_FIELD_WIDTH-g_s);
+      p ^= gd->shift[ind];
+      a32 <<= g_s;
+      p <<= g_s;
+      i -= g_s;
+    }
+  
+    ind = a32 >> (GF_FIELD_WIDTH-g_s);
+    p ^= gd->shift[ind];
+  
+    for (i = gd->tshift ; i >= 0; i -= g_r) {
+      l = p & (gd->rmask << i);
+      r = gd->reduce[l >> (i+32)];
+      r <<= (i);
+      p ^= r;
+    }
+
+    if (xor) p ^= *d32;
+    *d32 = p;
+    d32++;
+    s32++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+inline
+gf_val_32_t
+gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int leftover, rs;
+  uint32_t p, l, ind, a32;
+  int bits_left;
+  int g_s;
+
+  struct gf_w32_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+
+  gd = (struct gf_w32_group_data *) h->private;
+  gf_w32_group_set_shift_tables(gd->shift, b, h);
+
+  leftover = 32 % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  rs = 32 - leftover;
+  a32 = a;
+  ind = a32 >> rs;
+  a32 <<= leftover;
+  p = gd->shift[ind];
+
+  bits_left = rs;
+  rs = 32 - g_s;
+
+  while (bits_left > 0) {
+    bits_left -= g_s;
+    ind = a32 >> rs;
+    a32 <<= g_s;
+    l = p >> rs;
+    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
+  }
+  return p;
+}
+
+static
+inline
+gf_val_32_t
+gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t p, l, ind, a32;
+
+  struct gf_w32_group_data *d44;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  d44 = (struct gf_w32_group_data *) h->private;
+  gf_w32_group_set_shift_tables(d44->shift, b, h);
+
+  a32 = a;
+  ind = a32 >> 28;
+  a32 <<= 4;
+  p = d44->shift[ind];
+  ind = a32 >> 28;
+  a32 <<= 4;
+  l = p >> 28;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
+  ind = a32 >> 28;
+  a32 <<= 4;
+  l = p >> 28;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
+  ind = a32 >> 28;
+  a32 <<= 4;
+  l = p >> 28;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
+  ind = a32 >> 28;
+  a32 <<= 4;
+  l = p >> 28;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
+  ind = a32 >> 28;
+  a32 <<= 4;
+  l = p >> 28;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
+  ind = a32 >> 28;
+  a32 <<= 4;
+  l = p >> 28;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
+  ind = a32 >> 28;
+  l = p >> 28;
+  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
+  return p;
+}
+
+static
+inline
+gf_val_32_t
+gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int i;
+  int leftover;
+  uint64_t p, l, r;
+  uint32_t a32, ind;
+  int g_s, g_r;
+  struct gf_w32_group_data *gd;
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+  g_r = h->arg2;
+  gd = (struct gf_w32_group_data *) h->private;
+  gf_w32_group_set_shift_tables(gd->shift, b, h);
+
+  leftover = GF_FIELD_WIDTH % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  a32 = a;
+  ind = a32 >> (GF_FIELD_WIDTH - leftover);
+  p = gd->shift[ind];
+  p <<= g_s;
+  a32 <<= leftover;
+
+  i = (GF_FIELD_WIDTH - leftover);
+  while (i > g_s) {
+    ind = a32 >> (GF_FIELD_WIDTH-g_s);
+    p ^= gd->shift[ind];
+    a32 <<= g_s;
+    p <<= g_s;
+    i -= g_s;
+  }
+
+  ind = a32 >> (GF_FIELD_WIDTH-g_s);
+  p ^= gd->shift[ind];
+
+  for (i = gd->tshift ; i >= 0; i -= g_r) {
+    l = p & (gd->rmask << i);
+    r = gd->reduce[l >> (i+32)];
+    r <<= (i);
+    p ^= r;
+  }
+  return p;
+}
+
+static
+inline
+gf_val_32_t
+gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, bmask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  bmask = 0x80000000;
+
+  while (1) {
+    if (a & 1) prod ^= b;
+    a >>= 1;
+    if (a == 0) return prod;
+    if (b & bmask) {
+      b = ((b << 1) ^ pp);
+    } else {
+      b <<= 1;
+    }
+  }
+}
+
+static
+inline
+gf_val_32_t
+gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, pmask, amask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+
+  prod = 0;
+  pmask = 0x80000000;
+  amask = 0x80000000;
+
+  while (amask != 0) {
+    if (prod & pmask) {
+      prod = ((prod << 1) ^ pp);
+    } else {
+      prod <<= 1;
+    }
+    if (a & amask) prod ^= b;
+    amask >>= 1;
+  }
+  return prod;
+}
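+/* Editor's note contrasting the two scalar loops above: bytwo_b walks the
+   multiplier a from its low bit upward, doubling b (and reducing when b's
+   top bit is set) and xoring b into prod whenever the current bit of a is 1.
+   bytwo_p is the Horner form: it walks a from the top bit down, doubling the
+   partial product prod itself and folding b in when the bit is 1.  Both are
+   plain shift-and-add multiplication in GF(2^32); e.g. (editor's example)
+   a = 2, b = 5 gives prod = 0xA with no reduction, since the product stays
+   below 2^32. */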
+
+static
+void
+gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
+  gf_region_data rd;
+  struct gf_w32_bytwo_data *btd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x80000000;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x80000000;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi32(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
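+
+/* Editor's note on the macro above, read together with the vrev loop in
+   gf_w32_bytwo_p_sse_multiply_region below: vrev is built as the bit-wise
+   complement of val, reversed so that val's most significant bit lands in
+   bit 0.  At step k, (v & 1) is therefore 0 exactly when the k-th bit of val
+   (counting from the top) is 1, and (v & 1) - 1 becomes all-ones in that
+   lane, so the _mm_and/_mm_xor pair folds ta into prod precisely for the set
+   bits of val.  SSE_AB2 doubles prod first, so the 32 macro invocations form
+   the vectorised Horner loop of gf_w32_bytwo_p_multiply. */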
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint32_t vrev;
+  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
+  struct gf_w32_bytwo_data *btd;
+  gf_region_data rd;
+   
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  vrev = 0;
+  for (i = 0; i < 32; i++) {
+    vrev <<= 1;
+    if (!(val & (1 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
+  one = _mm_set1_epi32(1);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi32(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+static
+void
+gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
+  struct gf_w32_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  switch (val) {
+  case 2:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 3:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 4:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 5:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  default:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = *d64 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = 0 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    vb = _mm_load_si128 ((__m128i *)(d8));
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+static
+void 
+gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint32_t itb;
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_w32_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  if (val == 2) {
+    if (xor) {
+      gf_w32_bytwo_b_sse_region_2_xor(&rd, btd);
+    } else {
+      gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd);
+    }
+    gf_do_final_region_alignment(&rd);
+    return;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
+    itb = val;
+    while (1) {
+      if (itb & 1) vb = _mm_xor_si128(vb, va);
+      itb >>= 1;
+      if (itb == 0) break;
+      SSE_AB2(pp, m1, m2, va, t1, t2);
+    }
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+static
+int gf_w32_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  uint64_t ip, m1, m2;
+  struct gf_w32_bytwo_data *btd;
+
+  h = (gf_internal_t *) gf->scratch;
+  btd = (struct gf_w32_bytwo_data *) (h->private);
+  ip = h->prim_poly & 0xffffffff;
+  m1 = 0xfffffffe;
+  m2 = 0x80000000;
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+
+  while (ip != 0) {
+    btd->prim_poly |= ip;
+    btd->mask1 |= m1;
+    btd->mask2 |= m2;
+    ip <<= GF_FIELD_WIDTH;
+    m1 <<= GF_FIELD_WIDTH;
+    m2 <<= GF_FIELD_WIDTH;
+  }
+
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    gf->multiply.w32 = gf_w32_bytwo_p_multiply;
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; 
+      else
+        gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; 
+    #else
+      gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; 
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  } else {
+    gf->multiply.w32 = gf_w32_bytwo_b_multiply; 
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; 
+      else
+        gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; 
+    #else
+      gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; 
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  }
+
+  gf->inverse.w32 = gf_w32_euclid;
+  return 1;
+}
+
+static
+inline
+uint32_t
+gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
+{
+  uint32_t product, i, j, mask, tb;
+  gf_internal_t *h;
+  struct gf_w32_split_8_8_data *d8;
+  
+  h = (gf_internal_t *) gf->scratch;
+  d8 = (struct gf_w32_split_8_8_data *) h->private;
+  product = 0;
+  mask = 0xff;
+
+  for (i = 0; i < 4; i++) {
+    tb = b32;
+    for (j = 0; j < 4; j++) {
+      product ^= d8->tables[i+j][a32&mask][tb&mask];
+      tb >>= 8;
+    }
+    a32 >>= 8;
+  }
+  return product;
+}
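+/* Editor's note on the 8,8 split above: tables[k][x][y] appears to be filled
+   by gf_w32_split_init (below) with the field product of the byte
+   polynomials x and y shifted up by 8k bits (i.e. multiplied by x^(8k)) and
+   reduced.  Writing a32 and b32 as sums of bytes a_i*x^(8i) and b_j*x^(8j),
+   the double loop xors together tables[i+j][a_i][b_j] over all sixteen byte
+   pairs, which by distributivity is the full 32x32 product. */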
+
+static
+inline
+void
+gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  uint32_t *s32, *d32, *top, p, a, v;
+  struct gf_split_8_32_lazy_data *d8;
+  struct gf_w32_split_8_8_data *d88;
+  uint32_t *t[4];
+  int i, j, k, change;
+  uint32_t pp;
+  gf_region_data rd;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) {
+    d8 = (struct gf_split_8_32_lazy_data *) h->private;
+    for (i = 0; i < 4; i++) t[i] = d8->tables[i];
+    change = (val != d8->last_value);
+    if (change) d8->last_value = val;
+  } else {
+    d88 = (struct gf_w32_split_8_8_data *) h->private;
+    for (i = 0; i < 4; i++) t[i] = d88->region_tables[i];
+    change = (val != d88->last_value);
+    if (change) d88->last_value = val;
+  }
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+  
+  if (change) {
+    v = val;
+    for (i = 0; i < 4; i++) {
+      t[i][0] = 0;
+      for (j = 1; j < 256; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          t[i][k^j] = (v ^ t[i][k]);
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+      }
+    }
+  } 
+
+  while (d32 < top) {
+    p = (xor) ? *d32 : 0;
+    a = *s32;
+    i = 0;
+    while (a != 0) {
+      v = (a & 0xff);
+      p ^= t[i][v];
+      a >>= 8;
+      i++;
+    }
+    *d32 = p;
+    d32++;
+    s32++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+inline
+void
+gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  uint32_t *s32, *d32, *top, p, a, v;
+  struct gf_split_16_32_lazy_data *d16;
+  uint32_t *t[2];
+  int i, j, k, change;
+  uint32_t pp;
+  gf_region_data rd;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  d16 = (struct gf_split_16_32_lazy_data *) h->private;
+  for (i = 0; i < 2; i++) t[i] = d16->tables[i];
+  change = (val != d16->last_value);
+  if (change) d16->last_value = val;
+
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+  
+  if (change) {
+    v = val;
+    for (i = 0; i < 2; i++) {
+      t[i][0] = 0;
+      for (j = 1; j < (1 << 16); j <<= 1) {
+        for (k = 0; k < j; k++) {
+          t[i][k^j] = (v ^ t[i][k]);
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+      }
+    }
+  } 
+
+  while (d32 < top) {
+    p = (xor) ? *d32 : 0;
+    a = *s32;
+    i = 0;
+    while (a != 0 && i < 2) {
+      v = (a & 0xffff);
+      p ^= t[i][v];
+      a >>= 16;
+      i++;
+    }
+    *d32 = p;
+    d32++;
+    s32++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_2_32_lazy_data *ld;
+  int i;
+  uint32_t pp, v, v2, s, *s32, *d32, *top;
+  gf_region_data rd;
+ 
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_2_32_lazy_data *) h->private;
+  
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 16; i++) {
+      v2 = (v << 1);
+      if (v & GF_FIRST_BIT) v2 ^= pp;
+      ld->tables[i][0] = 0;
+      ld->tables[i][1] = v;
+      ld->tables[i][2] = v2;
+      ld->tables[i][3] = (v2 ^ v);
+      v = (v2 << 1);
+      if (v2 & GF_FIRST_BIT) v ^= pp;
+    }
+  }
+  ld->last_value = val;
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  while (d32 != top) {
+    v = (xor) ? *d32 : 0;
+    s = *s32;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&3];
+      s >>= 2;
+      i++;
+    }
+    *d32 = v;
+    d32++;
+    s32++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#ifdef INTEL_SSSE3
+static
+void
+gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, tindex;
+  uint32_t pp, v, v2, *s32, *d32, *top;
+  __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2;
+  gf_region_data rd;
+ 
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+  
+  v = val;
+  for (i = 0; i < 16; i++) {
+    v2 = (v << 1);
+    if (v & GF_FIRST_BIT) v2 ^= pp;
+    tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0);
+    v = (v2 << 1);
+    if (v2 & GF_FIRST_BIT) v ^= pp;
+  }
+
+  shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
+  adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
+  mask1 = _mm_set1_epi8(0x3);
+  mask2 = _mm_set1_epi8(0xc);
+
+  while (d32 != top) {
+    pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128();
+    vi = _mm_load_si128((__m128i *) s32);
+ 
+    tindex = 0;
+    for (i = 0; i < 4; i++) {
+      si = _mm_shuffle_epi8(vi, shuffler);
+
+      xi = _mm_and_si128(si, mask1);
+      xi = _mm_slli_epi16(xi, 2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      tindex++;
+
+      xi = _mm_and_si128(si, mask2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      si = _mm_srli_epi16(si, 2);
+      tindex++;
+
+      xi = _mm_and_si128(si, mask2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      si = _mm_srli_epi16(si, 2);
+      tindex++;
+
+      xi = _mm_and_si128(si, mask2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      tindex++;
+      
+      vi = _mm_srli_epi32(vi, 8);
+    }
+    _mm_store_si128((__m128i *) d32, pi);
+    d32 += 4;
+    s32 += 4;
+  }
+
+  gf_do_final_region_alignment(&rd);
+
+}
+#endif
+
+static
+void
+gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_4_32_lazy_data *ld;
+  int i, j, k;
+  uint32_t pp, v, s, *s32, *d32, *top;
+  gf_region_data rd;
+ 
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_4_32_lazy_data *) h->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+  
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 8; i++) {
+      ld->tables[i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+      }
+    }
+  }
+  ld->last_value = val;
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  while (d32 != top) {
+    v = (xor) ? *d32 : 0;
+    s = *s32;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&0xf];
+      s >>= 4;
+      i++;
+    }
+    *d32 = v;
+    d32++;
+    s32++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSSE3
+  gf_internal_t *h;
+  int i, j, k;
+  uint32_t pp, v, *s32, *d32, *top;
+  __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3;
+  struct gf_split_4_32_lazy_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+ 
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+  
+  ld = (struct gf_split_4_32_lazy_data *) h->private;
+ 
+  v = val;
+  for (i = 0; i < 8; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 4; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+
+  mask1 = _mm_set1_epi8(0xf);
+
+  if (xor) {
+    while (d32 != top) {
+      p0 = _mm_load_si128 ((__m128i *) d32);
+      p1 = _mm_load_si128 ((__m128i *) (d32+4));
+      p2 = _mm_load_si128 ((__m128i *) (d32+8));
+      p3 = _mm_load_si128 ((__m128i *) (d32+12));
+  
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+  
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
+      
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+  
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  } else {
+    while (d32 != top) {
+  
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_shuffle_epi8(tables[0][0], si);
+      p1 = _mm_shuffle_epi8(tables[0][1], si);
+      p2 = _mm_shuffle_epi8(tables[0][2], si);
+      p3 = _mm_shuffle_epi8(tables[0][3], si);
+      
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+  
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  }
+
+  gf_do_final_region_alignment(&rd);
+
+#endif
+}
+
+
+static
+void
+gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSSE3
+  gf_internal_t *h;
+  int i, j, k;
+  uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
+  __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8;
+  __m128i tv1, tv2, tv3, tv0;
+  uint8_t btable[16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  v = val;
+  for (i = 0; i < 8; i++) {
+    tmp_table[0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        tmp_table[k^j] = (v ^ tmp_table[k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 4; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) tmp_table[k];
+        tmp_table[k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+
+  mask1 = _mm_set1_epi8(0xf);
+  mask8 = _mm_set1_epi16(0xff);
+
+  if (xor) {
+    while (d32 != top) {
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+  
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+
+      v0 = _mm_packus_epi16(p1, p0);
+      v1 = _mm_packus_epi16(tv1, tv0);
+      v2 = _mm_packus_epi16(p3, p2);
+      v3 = _mm_packus_epi16(tv3, tv2);
+
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+
+      v0 = _mm_packus_epi16(p2, p0);
+      v1 = _mm_packus_epi16(p3, p1);
+      v2 = _mm_packus_epi16(tv2, tv0);
+      v3 = _mm_packus_epi16(tv3, tv1);
+
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_shuffle_epi8(tables[6][0], si);
+      p1 = _mm_shuffle_epi8(tables[6][1], si);
+      p2 = _mm_shuffle_epi8(tables[6][2], si);
+      p3 = _mm_shuffle_epi8(tables[6][3], si);
+      
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
+  
+      tv0 = _mm_unpackhi_epi8(p1, p3);
+      tv1 = _mm_unpackhi_epi8(p0, p2);
+      tv2 = _mm_unpacklo_epi8(p1, p3);
+      tv3 = _mm_unpacklo_epi8(p0, p2);
+
+      p0 = _mm_unpackhi_epi8(tv1, tv0);
+      p1 = _mm_unpacklo_epi8(tv1, tv0);
+      p2 = _mm_unpackhi_epi8(tv3, tv2);
+      p3 = _mm_unpacklo_epi8(tv3, tv2);
+
+      v0 = _mm_load_si128 ((__m128i *) d32);
+      v1 = _mm_load_si128 ((__m128i *) (d32+4));
+      v2 = _mm_load_si128 ((__m128i *) (d32+8));
+      v3 = _mm_load_si128 ((__m128i *) (d32+12));
+  
+      p0 = _mm_xor_si128(p0, v0);
+      p1 = _mm_xor_si128(p1, v1);
+      p2 = _mm_xor_si128(p2, v2);
+      p3 = _mm_xor_si128(p3, v3);
+
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  } else {
+    while (d32 != top) {
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+ 
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+      
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+      
+      v0 = _mm_packus_epi16(p1, p0);
+      v1 = _mm_packus_epi16(tv1, tv0);
+      v2 = _mm_packus_epi16(p3, p2);
+      v3 = _mm_packus_epi16(tv3, tv2);
+      
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+     
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+      
+      v0 = _mm_packus_epi16(p2, p0);
+      v1 = _mm_packus_epi16(p3, p1);
+      v2 = _mm_packus_epi16(tv2, tv0);
+      v3 = _mm_packus_epi16(tv3, tv1);
+      
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_shuffle_epi8(tables[6][0], si);
+      p1 = _mm_shuffle_epi8(tables[6][1], si);
+      p2 = _mm_shuffle_epi8(tables[6][2], si);
+      p3 = _mm_shuffle_epi8(tables[6][3], si);
+      
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); 
+  
+      tv0 = _mm_unpackhi_epi8(p1, p3);
+      tv1 = _mm_unpackhi_epi8(p0, p2);
+      tv2 = _mm_unpacklo_epi8(p1, p3);
+      tv3 = _mm_unpacklo_epi8(p0, p2);
+      
+      p0 = _mm_unpackhi_epi8(tv1, tv0);
+      p1 = _mm_unpacklo_epi8(tv1, tv0);
+      p2 = _mm_unpackhi_epi8(tv3, tv2);
+      p3 = _mm_unpacklo_epi8(tv3, tv2);
+      
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  }
+  gf_do_final_region_alignment(&rd);
+
+#endif
+}
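+/* Editor's note on the two SSSE3 region routines above: the _altmap variant
+   assumes the caller keeps data in the ALTMAP layout (the four bytes of each
+   word spread across four consecutive vectors), so it can feed nibbles to
+   _mm_shuffle_epi8 directly.  The plain variant accepts the standard layout
+   and pays for it with the _mm_packus/_mm_unpack shuffles at the top and
+   bottom of the loop, which transpose each 64-byte chunk into byte planes
+   and back around the same table lookups. */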
+
+static 
+int gf_w32_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_split_2_32_lazy_data *ld2;
+  struct gf_split_4_32_lazy_data *ld4;
+  struct gf_w32_split_8_8_data *d8;
+  struct gf_split_8_32_lazy_data *d32;
+  struct gf_split_16_32_lazy_data *d16;
+  uint32_t p, basep;
+  int i, j, exp, ispclmul, issse3;
+  int isneon = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+  ispclmul = 1;
+#else
+  ispclmul = 0;
+#endif
+
+#ifdef INTEL_SSSE3
+  issse3 = 1;
+#else
+  issse3 = 0;
+#endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Defaults */
+  
+  gf->inverse.w32 = gf_w32_euclid;
+
+  /* JSP: First handle single multiplication:  
+     If args == 8, then we're doing split 8 8.  
+     Otherwise, if PCLMUL, we use that.
+     Otherwise, we use bytwo_p.
+   */
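+  /* Editor's note (illustration, not from the original authors): the masks
+     below appear to pick a carry-less multiply variant by how many high
+     bits of prim_poly are clear -- the more room the non-leading terms
+     leave at the top of the word, the fewer PCLMUL folding steps the
+     reduction in gf_w32_clm_multiply_2/3/4 needs. */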
+
+  if (h->arg1 == 8 && h->arg2 == 8) {
+    gf->multiply.w32 = gf_w32_split_8_8_multiply;
+  } else if (ispclmul) {
+    if ((0xfffe0000 & h->prim_poly) == 0){
+      gf->multiply.w32 = gf_w32_clm_multiply_2;
+    } else if ((0xffc00000 & h->prim_poly) == 0){
+      gf->multiply.w32 = gf_w32_clm_multiply_3;
+    } else if ((0xfe000000 & h->prim_poly) == 0){
+     gf->multiply.w32 = gf_w32_clm_multiply_4;
+    }
+  } else {
+    gf->multiply.w32 = gf_w32_bytwo_p_multiply;
+  }
+
+  /* Easy cases: 16/32 and 2/32 */
+
+  if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) {
+    d16 = (struct gf_split_16_32_lazy_data *) h->private;
+    d16->last_value = 0;
+    gf->multiply_region.w32 = gf_w32_split_16_32_lazy_multiply_region;
+    return 1;
+  }
+
+  if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
+    ld2 = (struct gf_split_2_32_lazy_data *) h->private;
+    ld2->last_value = 0;
+    #ifdef INTEL_SSSE3
+      if (!(h->region_type & GF_REGION_NOSIMD))
+        gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
+    #else
+      gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
+      if(h->region_type & GF_REGION_SIMD) return 0;
+    #endif
+    return 1;
+  } 
+
+  /* 4/32 or Default + SIMD.  ALTMAP is only handled on the SIMD path
+     (there is no ALTMAP/NOSSE combination). */
+
+  if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
+      ((issse3 || isneon) && h->mult_type == GF_MULT_DEFAULT)) {
+    ld4 = (struct gf_split_4_32_lazy_data *) h->private;
+    ld4->last_value = 0;
+    if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
+      gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
+    } else if (isneon) {
+#ifdef ARM_NEON
+      gf_w32_neon_split_init(gf);
+#endif
+    } else if (h->region_type & GF_REGION_ALTMAP) {
+      gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
+    } else {
+      gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region;
+    }
+    return 1;
+  } 
+
+  /* 8/32 or Default + no SSE */
+
+  if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) || 
+       h->mult_type == GF_MULT_DEFAULT) {
+    d32 = (struct gf_split_8_32_lazy_data *) h->private;
+    d32->last_value = 0;
+    gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region;
+    return 1;
+  }
+
+  /* Finally, if args == 8, then we have to set up the tables here. */
+
+  if (h->arg1 == 8 && h->arg2 == 8) {
+    d8 = (struct gf_w32_split_8_8_data *) h->private;
+    d8->last_value = 0;
+    gf->multiply.w32 = gf_w32_split_8_8_multiply;
+    gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region;
+    basep = 1;
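+    /* Editor's note (illustration, not from the original authors): the loops
+       below appear to fill tables[exp][i][j] with the product i*j*x^(8*exp)
+       in GF(2^32), built entirely by doubling: column 1 is grown from basep
+       (basep = x^(8*exp), advanced by eight GF_MULTBY_TWO calls per exp),
+       and each row is then grown from its column-1 entry the same way. */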
+    for (exp = 0; exp < 7; exp++) {
+      for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
+      for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
+      d8->tables[exp][1][1] = basep;
+      for (i = 2; i < 256; i++) {
+        if (i&1) {
+          p = d8->tables[exp][i^1][1];
+          d8->tables[exp][i][1] = p ^ basep;
+        } else {
+          p = d8->tables[exp][i>>1][1];
+          d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
+        }
+      }
+      for (i = 1; i < 256; i++) {
+        p = d8->tables[exp][i][1];
+        for (j = 1; j < 256; j++) {
+          if (j&1) {
+            d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
+          } else {
+            d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
+          }
+        }
+      }
+      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
+    }
+    return 1;
+  }
+
+  /* If we get here, then the arguments were bad. */
+
+  return 0;
+}
+
+static
+int gf_w32_group_init(gf_t *gf)
+{
+  uint32_t i, j, p, index;
+  struct gf_w32_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t g_r, g_s;
+
+  g_s = h->arg1;
+  g_r = h->arg2;
+
+  gd = (struct gf_w32_group_data *) h->private;
+  gd->shift = (uint32_t *) (&(gd->memory));
+  gd->reduce = gd->shift + (1 << g_s);
+
+  gd->rmask = (1 << g_r) - 1;
+  gd->rmask <<= 32;
+
+  gd->tshift = 32 % g_s;
+  if (gd->tshift == 0) gd->tshift = g_s;
+  gd->tshift = (32 - gd->tshift);
+  gd->tshift = ((gd->tshift-1)/g_r) * g_r;
+
+  gd->reduce[0] = 0;
+  for (i = 0; i < ((uint32_t)1 << g_r); i++) {
+    p = 0;
+    index = 0;
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        p ^= (h->prim_poly << j);
+        index ^= (1 << j);
+        index ^= (h->prim_poly >> (32-j));
+      }
+    }
+    gd->reduce[index] = p;
+  }
+
+  if (g_s == g_r) {
+    gf->multiply.w32 = gf_w32_group_s_equals_r_multiply;
+    gf->multiply_region.w32 = gf_w32_group_s_equals_r_multiply_region; 
+  } else {
+    gf->multiply.w32 = gf_w32_group_multiply;
+    gf->multiply_region.w32 = gf_w32_group_multiply_region;
+  }
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = gf_w32_euclid;
+
+  return 1;
+}
+
+
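+/* Editor's note (illustration, not from the original authors): the composite
+   routines below treat GF(2^32) as GF((2^16)^2) with reducing polynomial
+   x^2 + s*x + 1, where s = h->prim_poly.  Writing a = a1*X + a0 and
+   b = b1*X + b0 over the 16-bit base field:
+
+     a*b = a1b1*X^2 + (a1b0 ^ a0b1)*X + a0b0
+         = (a1b0 ^ a0b1 ^ s*a1b1)*X + (a0b0 ^ a1b1)   (since X^2 = s*X + 1)
+
+   which is exactly the expression assembled in
+   gf_w32_composite_multiply_recursive and its inline log-table variant. */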
+static
+uint32_t
+gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t b0 = b & 0x0000ffff;
+  uint32_t b1 = (b & 0xffff0000) >> 16;
+  uint32_t a0 = a & 0x0000ffff;
+  uint32_t a1 = (a & 0xffff0000) >> 16;
+  uint32_t a1b1;
+  uint32_t rv;
+  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+  rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1);
+  return rv;
+}
+
+/* JSP: This could be made faster. Someday, when I'm bored. */
+
+static
+uint32_t
+gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t b0 = b & 0x0000ffff;
+  uint32_t b1 = b >> 16;
+  uint32_t a0 = a & 0x0000ffff;
+  uint32_t a1 = a >> 16;
+  uint32_t a1b1, prod;
+  uint16_t *log, *alog;
+  struct gf_w32_composite_data *cd;
+
+  cd = (struct gf_w32_composite_data *) h->private;
+  log = cd->log;
+  alog = cd->alog;
+
+  a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
+  prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
+  prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
+  prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
+  prod <<= 16;
+  prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
+  prod ^= a1b1;
+  return prod;
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report)
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
+ *
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ *
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ *
+ * a / b = a * c
+ */
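+
+/* Editor's note (illustration, not from the original authors): the two
+   special cases handled first in gf_w32_composite_inverse follow from the
+   same identity.  If a0 == 0 then a = a1*X, and (a1*X)*(c1*X + c0) =
+   (s*a1c1 ^ a1c0)*X + a1c1, so c1 = a1^-1 and c0 = s*a1^-1.  If a1 == 0
+   the element lies in the base field and c is simply a0^-1 with c1 = 0. */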
+
+static
+uint32_t
+gf_w32_composite_inverse(gf_t *gf, uint32_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint16_t a0 = a & 0x0000ffff;
+  uint16_t a1 = (a & 0xffff0000) >> 16;
+  uint16_t c0, c1, d, tmp;
+  uint32_t c;
+  uint16_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w32(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    a0inv = base_gf->inverse.w32(base_gf, a0);
+
+    d = base_gf->multiply.w32(base_gf, a1, a0inv);
+
+    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
+    tmp = base_gf->inverse.w32(base_gf, tmp);
+
+    d = base_gf->multiply.w32(base_gf, d, tmp);
+
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
+  }
+
+  c = c0 | (c1 << 16);
+
+  return c;
+}
+
+static
+void
+gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t b0 = val & 0x0000ffff;
+  uint32_t b1 = (val & 0xffff0000) >> 16;
+  uint32_t *s32, *d32, *top;
+  uint16_t a0, a1, a1b1, *log, *alog;
+  uint32_t prod;
+  gf_region_data rd;
+  struct gf_w32_composite_data *cd;
+
+  cd = (struct gf_w32_composite_data *) h->private;
+  log = cd->log;
+  alog = cd->alog;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  
+  s32 = rd.s_start;
+  d32 = rd.d_start;
+  top = rd.d_top;
+
+  if (log == NULL) {
+    if (xor) {
+      while (d32 < top) {
+        a0 = *s32 & 0x0000ffff;
+        a1 = (*s32 & 0xffff0000) >> 16;
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+  
+        *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                  ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); 
+        s32++;
+        d32++;
+      }
+    } else {
+      while (d32 < top) {
+        a0 = *s32 & 0x0000ffff;
+        a1 = (*s32 & 0xffff0000) >> 16;
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+  
+        *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                  ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); 
+        s32++;
+        d32++;
+      }
+    }
+  } else {
+    if (xor) {
+      while (d32 < top) {
+        a0 = *s32 & 0x0000ffff;
+        a1 = (*s32 & 0xffff0000) >> 16;
+        a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
+
+        prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
+        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
+        prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
+        prod <<= 16;
+        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
+        prod ^= a1b1;
+        *d32 ^= prod;
+        s32++;
+        d32++;
+      }
+    } else {
+      while (d32 < top) {
+        a0 = *s32 & 0x0000ffff;
+        a1 = (*s32 & 0xffff0000) >> 16;
+        a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
+  
+        prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
+        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
+        prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
+        prod <<= 16;
+        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
+        prod ^= a1b1;
+        
+        *d32 = prod;
+        s32++;
+        d32++;
+      }
+    }
+  }
+}
+
+static
+void
+gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint16_t    val0 = val & 0x0000ffff;
+  uint16_t    val1 = (val & 0xffff0000) >> 16;
+  gf_region_data rd;
+  int sub_reg_size;
+  uint8_t *slow, *shigh;
+  uint8_t *dlow, *dhigh, *top;
+
+  /* JSP: I want the two pointers aligned wrt each other on 16 byte
+     boundaries.  So I'm going to make sure that the area on
+     which the two operate is a multiple of 32. Of course, that
+     junks up the mapping, but so be it -- that's why we have extract_word.... */
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  slow = (uint8_t *) rd.s_start;
+  dlow = (uint8_t *) rd.d_start;
+  top = (uint8_t *)  rd.d_top;
+  sub_reg_size = (top - dlow)/2;
+  shigh = slow + sub_reg_size;
+  dhigh = dlow + sub_reg_size;
+  
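+  /* Editor's note (illustration, not from the original authors): with the
+     ALTMAP layout the low and high 16-bit halves live in separate
+     half-regions, so the five calls below spell out the composite product
+     (a1*X + a0)*(val1*X + val0) term by term:
+       dlow  accumulates a0*val0 and a1*val1            (the X^0 part),
+       dhigh accumulates a0*val1, a1*val0 and s*a1*val1 (the X^1 part). */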
+  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+int gf_w32_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_composite_data *cd;
+
+  if (h->base_gf == NULL) return 0;
+
+  cd = (struct gf_w32_composite_data *) h->private;
+  cd->log = gf_w16_get_log_table(h->base_gf);
+  cd->alog = gf_w16_get_mult_alog_table(h->base_gf);
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt;
+  } else {
+    gf->multiply_region.w32 = gf_w32_composite_multiply_region;
+  }
+
+  if (cd->log == NULL) {
+    gf->multiply.w32 = gf_w32_composite_multiply_recursive;
+  } else {
+    gf->multiply.w32 = gf_w32_composite_multiply_inline; 
+  }
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = gf_w32_composite_inverse;
+
+  return 1;
+}
+
+
+
+int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  int issse3 = 0;
+  int isneon = 0;
+
+#ifdef INTEL_SSSE3
+  issse3 = 1;
+#endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
+
+  switch(mult_type)
+  {
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64;
+      break;
+    case GF_MULT_GROUP: 
+      return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) +
+               sizeof(uint32_t) * (1 << arg1) +
+               sizeof(uint32_t) * (1 << arg2) + 64;
+      break;
+    case GF_MULT_DEFAULT:
+
+    case GF_MULT_SPLIT_TABLE: 
+        if (arg1 == 8 && arg2 == 8){
+          return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64;
+        }
+        if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64;
+        }
+        if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
+        }
+        if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || 
+             (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
+        }
+        if ((arg1 == 4 && arg2 == 32) || 
+            (arg2 == 4 && arg1 == 32) ||
+            mult_type == GF_MULT_DEFAULT) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
+        }
+        return 0;
+    case GF_MULT_CARRY_FREE:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_CARRY_FREE_GK:
+      return sizeof(gf_internal_t) + sizeof(uint64_t)*2;
+      break;
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_COMPOSITE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64;
+      break;
+
+    default:
+      return 0;
+   }
+   return 0;
+}
+
+int gf_w32_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) { 
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+    } else { 
+
+      /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/
+
+      /* h->prim_poly = 0xc5; */
+
+      /* Allen: The following is the traditional primitive polynomial for GF(2^32) */
+
+      h->prim_poly = 0x400007;
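+      /* Editor's note: 0x400007 encodes x^22 + x^2 + x + 1, with the
+         leading x^32 term left implicit. */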
+    } 
+  }
+
+  /* No leading one */
+
+  if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff;
+    
+  gf->multiply.w32 = NULL;
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = NULL;
+  gf->multiply_region.w32 = NULL;
+
+  switch(h->mult_type) {
+    case GF_MULT_CARRY_FREE:    if (gf_w32_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE_GK: if (gf_w32_cfmgk_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:         if (gf_w32_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:     if (gf_w32_composite_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT: 
+    case GF_MULT_SPLIT_TABLE:   if (gf_w32_split_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:         if (gf_w32_group_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p:   
+    case GF_MULT_BYTWO_b:       if (gf_w32_bytwo_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    gf->divide.w32 = gf_w32_divide_from_inverse;
+    gf->inverse.w32 = gf_w32_euclid;
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    gf->divide.w32 = gf_w32_divide_from_inverse;
+    gf->inverse.w32 = gf_w32_matrix;
+  }
+
+  if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+    gf->divide.w32 = gf_w32_divide_from_inverse;
+  }
+  if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
+    gf->inverse.w32 = gf_w32_inverse_from_divide;
+  }
+  if (h->region_type == GF_REGION_CAUCHY) {
+    gf->extract_word.w32 = gf_wgen_extract_word;
+    gf->multiply_region.w32 = gf_wgen_cauchy_region;
+  } else if (h->region_type & GF_REGION_ALTMAP) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      gf->extract_word.w32 = gf_w32_composite_extract_word;
+    } else {
+      gf->extract_word.w32 = gf_w32_split_extract_word;
+    }
+  } else {
+    gf->extract_word.w32 = gf_w32_extract_word;
+  }
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w4.c b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c
new file mode 100644
index 0000000..0e86aa8
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c
@@ -0,0 +1,2051 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w4.c
+ *
+ * Routines for 4-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w4.h"
+
+#define AB2(ip, am1 ,am2, b, t1, t2) {\
+  t1 = (b << 1) & am1;\
+  t2 = b & am2; \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
+  b = (t1 ^ (t2 & ip));}
+
+// ToDo(KMG/JSP): Why is 0x88 hard-coded?
+#define SSE_AB2(pp, m1, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
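+
+/* Editor's note (illustration, not from the original authors): AB2 and
+   SSE_AB2 multiply every packed 4-bit element of b by x (i.e. by 2) at once.
+   t1 keeps the bits that stay inside each nibble after the shift; t2
+   isolates the nibbles whose top bit was set, and (t2 << 1) - (t2 >> 3)
+   turns each such 0x8 into an 0xF mask, so xoring in (t2 & ip) adds the
+   reduction polynomial only to the nibbles that overflowed.  The hard-coded
+   0x88 in SSE_AB2 is that same per-nibble top-bit mask replicated across
+   each byte (this assumes ip, am1 and am2 hold the per-nibble polynomial
+   and masks, as set up by the bytwo init code). */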
+
+/* ------------------------------------------------------------
+   JSP: These are basic and work from multiple implementations.
+ */
+
+static
+inline
+gf_val_32_t gf_w4_inverse_from_divide (gf_t *gf, gf_val_32_t a)
+{
+  return gf->divide.w32(gf, 1, a);
+}
+
+static
+inline
+gf_val_32_t gf_w4_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  b = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b);
+}
+
+static
+inline
+gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
+{
+  gf_val_32_t e_i, e_im1, e_ip1;
+  gf_val_32_t d_i, d_im1, d_ip1;
+  gf_val_32_t y_i, y_im1, y_ip1;
+  gf_val_32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = 4;
+  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+static 
+gf_val_32_t gf_w4_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint8_t *r8, v;
+
+  r8 = (uint8_t *) start;
+  v = r8[index/2];
+  if (index%2) {
+    return v >> 4;
+  } else {
+    return v&0xf;
+  }
+}
+
+
+static
+inline
+gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b)
+{
+  return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly);
+}
+
+
+static
+inline
+gf_val_32_t
+gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint8_t product, i, pp;
+  gf_internal_t *h;
+  
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  product = 0;
+
+  for (i = 0; i < GF_FIELD_WIDTH; i++) { 
+    if (a & (1 << i)) product ^= (b << i);
+  }
+  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
+    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); 
+  }
+  return product;
+}
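+
+/* Editor's note, a worked example (not from the original authors): with the
+   common w=4 polynomial 0x13 (x^4 + x + 1), a = 7 and b = 9 give the
+   carry-free product 0x3f.  Bit 5 is set, so xor in (0x13 << 1) = 0x26,
+   leaving 0x19; bit 4 is set, so xor in 0x13, leaving 0x0a.  Hence
+   7 * 9 = 0xa in GF(16). */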
+
+/* Ben: This function works, but it is 33% slower than the normal shift mult */
+
+static
+inline
+gf_val_32_t
+gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0);
+  b = _mm_insert_epi32 (a, b4, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL));
+
+  /* Do the initial multiply */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will
+     have to do the reduction only once, because (w-2)/z == 1, where
+     z is equal to the number of zeros after the leading 1.
+
+     _mm_clmulepi64_si128 is the carryless multiply operation. Here
+     _mm_srli_epi64 shifts the result to the right by 4 bits. This allows
+     us to multiply the prim_poly by the leading bits of the result. We
+     then xor the result of that operation back with the result. */
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+  return rv;
+}
+
+static
+void
+gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int 
+    xor)
+{
+  gf_region_data rd;
+  uint8_t *s8;
+  uint8_t *d8;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  if (xor) {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) | 
+             ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
+      d8++;
+      s8++;
+    }
+  } else {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) | 
+             ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
+      d8++;
+      s8++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/* ------------------------------------------------------------
+  IMPLEMENTATION: LOG_TABLE: 
+
+  JSP: This is a basic log-antilog implementation.  
+       I'm not going to spend any time optimizing it because the
+       other techniques are faster for both single and region
+       operations. 
+ */
+
+static
+inline
+gf_val_32_t
+gf_w4_log_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_logtable_data *ltd;
+    
+  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
+}
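+
+/* Editor's note, a worked example (not from the original authors): with
+   polynomial 0x13 and generator 2, log(7) = 10 and log(9) = 14, so the sum
+   24 indexes the doubled antilog table (24 - 15 = 9) and antilog(9) = 0xa,
+   matching the shift multiply above.  Duplicating the antilog table is what
+   lets the lookup skip the "mod 15". */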
+
+static
+inline
+gf_val_32_t
+gf_w4_log_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int log_sum = 0;
+  struct gf_logtable_data *ltd;
+    
+  if (a == 0 || b == 0) return 0;
+  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b];
+  return (ltd->antilog_tbl_div[log_sum]);
+}
+
+static
+void 
+gf_w4_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t lv, b, c;
+  uint8_t *s8, *d8;
+  
+  struct gf_logtable_data *ltd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  lv = ltd->log_tbl[val];
+
+  for (i = 0; i < bytes; i++) {
+    c = (xor) ? d8[i] : 0;
+    b = (s8[i] >> GF_FIELD_WIDTH);
+    c ^= (b == 0) ? 0 : (ltd->antilog_tbl[lv + ltd->log_tbl[b]] << GF_FIELD_WIDTH);
+    b = (s8[i] & 0xf);
+    c ^= (b == 0) ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[b]];
+    d8[i] = c;
+  }
+}
+
+static 
+int gf_w4_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_logtable_data *ltd;
+  int i, b;
+
+  h = (gf_internal_t *) gf->scratch;
+  ltd = h->private;
+
+  for (i = 0; i < GF_FIELD_SIZE; i++)
+    ltd->log_tbl[i]=0;
+
+  ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
+  b = 1;
+  i = 0;
+  do {
+    if (ltd->log_tbl[b] != 0 && i != 0) {
+      fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n");
+      return 0;
+    }
+    ltd->log_tbl[b] = i;
+    ltd->antilog_tbl[i] = b;
+    ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
+    b <<= 1;
+    i++;
+    if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
+  } while (b != 1);
+
+  if (i != GF_FIELD_SIZE - 1) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+    
+  gf->inverse.w32 = gf_w4_inverse_from_divide;
+  gf->divide.w32 = gf_w4_log_divide;
+  gf->multiply.w32 = gf_w4_log_multiply;
+  gf->multiply_region.w32 = gf_w4_log_multiply_region;
+  return 1;
+}
+
+/* ------------------------------------------------------------
+  IMPLEMENTATION: SINGLE TABLE: JSP. 
+ */
+
+static
+inline
+gf_val_32_t
+gf_w4_single_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_single_table_data *std;
+    
+  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return std->mult[a][b];
+}
+
+static
+inline
+gf_val_32_t
+gf_w4_single_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_single_table_data *std;
+    
+  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return std->div[a][b];
+}
+
+static
+void 
+gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t b, c;
+  uint8_t *s8, *d8;
+  
+  struct gf_single_table_data *std;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  for (i = 0; i < bytes; i++) {
+    c = (xor) ? d8[i] : 0;
+    b = (s8[i] >> GF_FIELD_WIDTH);
+    c ^= (std->mult[val][b] << GF_FIELD_WIDTH);
+    b = (s8[i] & 0xf);
+    c ^= (std->mult[val][b]);
+    d8[i] = c;
+  }
+}
+
+#define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); }
+
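+/* Editor's note (illustration, not from the original authors): the SSSE3
+   version below uses PSHUFB as a 16-entry lookup.  tl is the row
+   std->mult[val][0..15]; because every product fits in a nibble, shifting
+   each 64-bit lane left by 4 gives th, the same row with results moved into
+   the high nibble.  Each input byte is then split into its two nibbles,
+   looked up in tl and th respectively, and the two results are xored back
+   together. */
+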
+#ifdef INTEL_SSSE3
+static
+void 
+gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *base, *sptr, *dptr, *top;
+  __m128i  tl, loset, r, va, th;
+  
+  struct gf_single_table_data *std;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+
+  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  base = (uint8_t *) std->mult;
+  base += (val << GF_FIELD_WIDTH);
+
+  gf_do_initial_region_alignment(&rd);
+
+  tl = _mm_loadu_si128((__m128i *)base);
+  th = _mm_slli_epi64(tl, 4);
+  loset = _mm_set1_epi8 (0x0f);
+
+  sptr = rd.s_start;
+  dptr = rd.d_start;
+  top = rd.s_top;
+
+  while (sptr < (uint8_t *) top) {
+    va = _mm_load_si128 ((__m128i *)(sptr));
+    r = _mm_and_si128 (loset, va);
+    r = _mm_shuffle_epi8 (tl, r);
+    va = _mm_srli_epi64 (va, 4);
+    va = _mm_and_si128 (loset, va);
+    va = _mm_shuffle_epi8 (th, va);
+    r = _mm_xor_si128 (r, va);
+    va = (xor) ? _mm_load_si128 ((__m128i *)(dptr)) : _mm_setzero_si128(); 
+    r = _mm_xor_si128 (r, va);
+    _mm_store_si128 ((__m128i *)(dptr), r);
+    dptr += 16;
+    sptr += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+
+}
+#endif
+
+static 
+int gf_w4_single_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_single_table_data *std;
+  int a, b, prod;
+
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_single_table_data *)h->private;
+
+  bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w4_shift_multiply(gf, a, b);
+      std->mult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+
+  gf->inverse.w32 = NULL;
+  gf->divide.w32 = gf_w4_single_table_divide;
+  gf->multiply.w32 = gf_w4_single_table_multiply;
+  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+    if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
+      gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
+    else
+    #if defined(INTEL_SSSE3)
+      gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
+    #elif defined(ARM_NEON)
+      gf_w4_neon_single_table_init(gf);
+    #endif
+  #else
+    gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
+    if (h->region_type & GF_REGION_SIMD) return 0;
+  #endif
+
+  return 1;
+}
+
+/* ------------------------------------------------------------
+  IMPLEMENTATION: DOUBLE TABLE: JSP. 
+ */
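+
+/* Editor's note (illustration, not from the original authors): the double
+   table stores, for every multiplier val, a 256-entry row indexed by a whole
+   source byte; each entry already packs the products of both nibbles, so the
+   region routine needs just one table lookup per byte. */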
+
+static
+inline
+gf_val_32_t
+gf_w4_double_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_double_table_data *std;
+    
+  std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return std->mult[a][b];
+}
+
+static
+inline
+gf_val_32_t
+gf_w4_double_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_double_table_data *std;
+    
+  std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return std->div[a][b];
+}
+
+static
+void 
+gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8, *base;
+  gf_region_data rd;
+  struct gf_double_table_data *std;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+
+  std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+  base = (uint8_t *) std->mult;
+  base += (val << GF_DOUBLE_WIDTH);
+
+  if (xor) {
+    for (i = 0; i < bytes; i++) d8[i] ^= base[s8[i]];
+  } else {
+    for (i = 0; i < bytes; i++) d8[i] = base[s8[i]];
+  }
+}
+
+static 
+int gf_w4_double_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_double_table_data *std;
+  int a, b, c, prod, ab;
+  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_double_table_data *)h->private;
+
+  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w4_shift_multiply(gf, a, b);
+      mult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+  bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  for (a = 0; a < GF_FIELD_SIZE; a++) {
+    for (b = 0; b < GF_FIELD_SIZE; b++) {
+      ab = mult[a][b];
+      for (c = 0; c < GF_FIELD_SIZE; c++) {
+        std->mult[a][(b << 4) | c] = ((ab << 4) | mult[a][c]);
+      }
+    }
+  }
+
+  gf->inverse.w32 = NULL;
+  gf->divide.w32 = gf_w4_double_table_divide;
+  gf->multiply.w32 = gf_w4_double_table_multiply;
+  gf->multiply_region.w32 = gf_w4_double_table_multiply_region;
+  return 1;
+}
+
+
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_lazy_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_quad_table_lazy_data *std;
+    
+  std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return std->div[a][b];
+}
+
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_lazy_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_quad_table_lazy_data *std;
+    
+  std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return std->smult[a][b];
+}
+
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_quad_table_data *std;
+    
+  std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  return std->div[a][b];
+}
+
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_quad_table_data *std;
+  uint16_t v;
+    
+  std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  v = std->mult[a][b];
+  return v;
+}
+
+static
+void 
+gf_w4_quad_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint16_t *base;
+  gf_region_data rd;
+  struct gf_quad_table_data *std;
+  struct gf_quad_table_lazy_data *ltd;
+  gf_internal_t *h;
+  int a, b, c, d, va, vb, vc, vd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) (gf->scratch);
+  if (h->region_type & GF_REGION_LAZY) {
+    ltd = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
+    base = ltd->mult;
+    for (a = 0; a < 16; a++) {
+      va = (ltd->smult[val][a] << 12);
+      for (b = 0; b < 16; b++) {
+        vb = (ltd->smult[val][b] << 8);
+        for (c = 0; c < 16; c++) {
+          vc = (ltd->smult[val][c] << 4);
+          for (d = 0; d < 16; d++) {
+            vd = ltd->smult[val][d];
+            base[(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
+          }
+        }
+      }
+    }
+  } else {
+    std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+    base = &(std->mult[val][0]);
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+  gf_two_byte_region_table_multiply(&rd, base);
+  gf_do_final_region_alignment(&rd);
+}
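+
+/* Editor's note (illustration, not from the original authors): the quad
+   table works on 16-bit chunks, four nibbles per lookup.  The lazy variant
+   keeps only the 16x16 single products and, as seen above, expands the full
+   2^16-entry row for the current val inside multiply_region; the non-lazy
+   variant precomputes all sixteen rows in gf_w4_quad_table_init below.
+   Either way the region work is handed to gf_two_byte_region_table_multiply. */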
+
+static 
+int gf_w4_quad_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_quad_table_data *std;
+  int prod, val, a, b, c, d, va, vb, vc, vd;
+  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_quad_table_data *)h->private;
+
+  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w4_shift_multiply(gf, a, b);
+      mult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+
+  for (val = 0; val < 16; val++) {
+    for (a = 0; a < 16; a++) {
+      va = (mult[val][a] << 12);
+      for (b = 0; b < 16; b++) {
+        vb = (mult[val][b] << 8);
+        for (c = 0; c < 16; c++) {
+          vc = (mult[val][c] << 4);
+          for (d = 0; d < 16; d++) {
+            vd = mult[val][d];
+            std->mult[val][(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
+          }
+        }
+      }
+    }
+  }
+
+  gf->inverse.w32 = NULL;
+  gf->divide.w32 = gf_w4_quad_table_divide;
+  gf->multiply.w32 = gf_w4_quad_table_multiply;
+  gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+  return 1;
+}
+static 
+int gf_w4_quad_table_lazy_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_quad_table_lazy_data *std;
+  int a, b, prod, loga, logb;
+  uint8_t log_tbl[GF_FIELD_SIZE];
+  uint8_t antilog_tbl[GF_FIELD_SIZE*2];
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_quad_table_lazy_data *)h->private;
+
+  b = 1;
+  for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
+      log_tbl[b] = a;
+      antilog_tbl[a] = b;
+      antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
+      b <<= 1;
+      if (b & GF_FIELD_SIZE) {
+          b = b ^ h->prim_poly;
+      }
+  }
+
+  bzero(std->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    loga = log_tbl[a];
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      logb = log_tbl[b];
+      prod = antilog_tbl[loga+logb];
+      std->smult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+
+  gf->inverse.w32 = NULL;
+  gf->divide.w32 = gf_w4_quad_table_lazy_divide;
+  gf->multiply.w32 = gf_w4_quad_table_lazy_multiply;
+  gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+  return 1;
+}
+
+static 
+int gf_w4_table_init(gf_t *gf)
+{
+  int rt;
+  gf_internal_t *h;
+  int simd = 0;
+
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+  simd = 1;
+#endif
+
+  h = (gf_internal_t *) gf->scratch;
+  rt = (h->region_type);
+
+  if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
+
+  if (rt & GF_REGION_DOUBLE_TABLE) {
+    return gf_w4_double_table_init(gf);
+  } else if (rt & GF_REGION_QUAD_TABLE) {
+    if (rt & GF_REGION_LAZY) {
+      return gf_w4_quad_table_lazy_init(gf);
+    } else {
+      return gf_w4_quad_table_init(gf);
+    }
+  } else {
+    return gf_w4_single_table_init(gf);
+  }
+  return 0;
+}
+
+/* ------------------------------------------------------------
+   JSP: GF_MULT_BYTWO_p and _b: See the paper.
+*/
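+
+/* Editor's note (illustration, not from the original authors): BYTWO_p runs
+   Horner's rule over the bits of a from high to low, doubling the product
+   (with reduction) at each step and xoring in b when the bit is set.
+   BYTWO_b scans the bits of a from low to high and instead doubles b.
+   Trace of BYTWO_b for a = 7, b = 9, poly 0x13:  bit 0 -> prod = 9, b -> 1;
+   bit 1 -> prod = 9 ^ 1 = 8, b -> 2;  bit 2 -> prod = 8 ^ 2 = 0xa.  Same
+   answer as the table and shift routines above. */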
+
+static
+inline
+gf_val_32_t
+gf_w4_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, pmask, amask;
+  gf_internal_t *h;
+  
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  
+  prod = 0;
+  pmask = 0x8;
+  amask = 0x8;
+
+  while (amask != 0) {
+    if (prod & pmask) {
+      prod = ((prod << 1) ^ pp);
+    } else {
+      prod <<= 1;
+    }
+    if (a & amask) prod ^= b;
+    amask >>= 1;
+  }
+  return prod;
+}
+
+static
+inline
+gf_val_32_t
+gf_w4_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, bmask;
+  gf_internal_t *h;
+  
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  bmask = 0x8;
+
+  while (1) {
+    if (a & 1) prod ^= b;
+    a >>= 1;
+    if (a == 0) return prod;
+    if (b & bmask) {
+      b = ((b << 1) ^ pp);
+    } else {
+      b <<= 1;
+    }
+  }
+}
+
+static
+void 
+gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
+  gf_region_data rd;
+  struct gf_bytwo_data *btd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x8;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else { 
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x8;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi8(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint8_t vrev;
+  __m128i pp, m1, ta, prod, t1, t2, tp, one, v;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  vrev = 0;
+  for (i = 0; i < 4; i++) {
+    vrev <<= 1;
+    if (!(val & (1 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  one = _mm_set1_epi8(1);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi8(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/*
+static
+void 
+gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSE2
+  uint8_t *d8, *s8, tb;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+
+  if (xor) {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_load_si128 ((__m128i *)(d8));
+      tb = val;
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);
+        tb >>= 1;
+        if (tb == 0) break;
+        SSE_AB2(pp, m1, m2, va, t1, t2);
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  } else {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_setzero_si128 ();
+      tb = val;
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);
+        tb >>= 1;
+        if (tb == 0) break;
+        t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);
+        t2 = _mm_and_si128(va, m2);
+        t2 = _mm_sub_epi64 (
+          _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
+        va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+#endif
+}
+*/
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_load_si128 ((__m128i *)(d8));
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_load_si128 ((__m128i *)(d8));
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    va = _mm_xor_si128(va, vb);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    va = _mm_xor_si128(va, vb);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_xor_si128(va, vb);
+    SSE_AB2(pp, m1, va, t1, t2);
+    va = _mm_xor_si128(va, vb);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_xor_si128(vb, va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    va = _mm_xor_si128(va, vb);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+void 
+gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint8_t *d8, *s8, tb;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  switch (val) {
+    case 2:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_2_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_2_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 3:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_3_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_3_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 4:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_4_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_4_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 5:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_5_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_5_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 6:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_6_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_6_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 7:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_7_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_7_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+  }
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+
+  if (xor) {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_load_si128 ((__m128i *)(d8));
+      tb = val;
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);
+        tb >>= 1;
+        if (tb == 0) break;
+        SSE_AB2(pp, m1, va, t1, t2);
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  } else {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_setzero_si128 ();
+      tb = val;
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);
+        tb >>= 1;
+        if (tb == 0) break;
+        t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);
+        t2 = _mm_and_si128(va, m2);
+        t2 = _mm_sub_epi64 (
+          _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
+        va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+static
+void 
+gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  switch (val) {
+  case 1:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        *d64 ^= *s64;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        *d64 = *s64;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 2:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 3:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 4:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 5:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 6:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 7:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 8:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 9:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 10:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 11:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 12:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 13:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 14:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 15:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  default:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = *d64 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = 0 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static 
+int gf_w4_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  uint64_t ip, m1, m2;
+  struct gf_bytwo_data *btd;
+
+  h = (gf_internal_t *) gf->scratch;
+  btd = (struct gf_bytwo_data *) (h->private);
+  ip = h->prim_poly & 0xf;
+  m1 = 0xe;
+  m2 = 0x8;
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+
+  while (ip != 0) {
+    btd->prim_poly |= ip;
+    btd->mask1 |= m1;
+    btd->mask2 |= m2;
+    ip <<= GF_FIELD_WIDTH;
+    m1 <<= GF_FIELD_WIDTH;
+    m2 <<= GF_FIELD_WIDTH;
+  }
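+  /* The loop above replicates the 4-bit polynomial and the two bytwo masks
+     into every nibble of a 64-bit word, so that a single AB2() step doubles
+     all sixteen GF(2^4) elements packed in that word at once.  With the
+     default polynomial 0x13 this gives prim_poly = 0x3333333333333333,
+     mask1 = 0xeeeeeeeeeeeeeeee and mask2 = 0x8888888888888888. */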
+
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    gf->multiply.w32 = gf_w4_bytwo_p_multiply;
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
+    #else
+      gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
+      if (h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  } else {
+    gf->multiply.w32 = gf_w4_bytwo_b_multiply;
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
+    #else
+      gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
+      if (h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  }
+  return 1;
+}
+
+
+static 
+int gf_w4_cfm_init(gf_t *gf)
+{
+#if defined(INTEL_SSE4_PCLMUL)
+  gf->multiply.w32 = gf_w4_clm_multiply;
+  return 1;
+#elif defined(ARM_NEON)
+  return gf_w4_neon_cfm_init(gf);
+#endif
+  return 0;
+}
+
+static 
+int gf_w4_shift_init(gf_t *gf)
+{
+  gf->multiply.w32 = gf_w4_shift_multiply;
+  return 1;
+}
+
+/* JSP: I'm putting all error-checking into gf_error_check(), so you don't 
+   have to do error checking in scratch_size or in init */
+
+int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  int issse3 = 0, isneon = 0;
+
+#ifdef INTEL_SSSE3
+  issse3 = 1;
+#endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
+
+  switch(mult_type)
+  {
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data);
+      break;
+    case GF_MULT_DEFAULT:
+    case GF_MULT_TABLE:
+      if (region_type == GF_REGION_CAUCHY) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
+      }
+
+      if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+          region_type = GF_REGION_DOUBLE_TABLE;
+
+      if (region_type & GF_REGION_DOUBLE_TABLE) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
+      } else if (region_type & GF_REGION_QUAD_TABLE) {
+        if ((region_type & GF_REGION_LAZY) == 0) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64;
+        } else {
+          return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64;
+        }
+      } else {
+        return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
+      }
+      break;
+
+    case GF_MULT_LOG_TABLE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
+      break;
+    case GF_MULT_CARRY_FREE:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+      break;
+    default:
+      return 0;
+   }
+  return 0;
+}
+
+int
+gf_w4_init (gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->prim_poly == 0) h->prim_poly = 0x13;
+  h->prim_poly |= 0x10;
+  gf->multiply.w32 = NULL;
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = NULL;
+  gf->multiply_region.w32 = NULL;
+  gf->extract_word.w32 = gf_w4_extract_word;
+
+  switch(h->mult_type) {
+    case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:      if (gf_w4_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p:   
+    case GF_MULT_BYTWO_b:    if (gf_w4_bytwo_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_TABLE:  if (gf_w4_log_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT:   
+    case GF_MULT_TABLE:      if (gf_w4_table_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    gf->divide.w32 = gf_w4_divide_from_inverse;
+    gf->inverse.w32 = gf_w4_euclid;
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    gf->divide.w32 = gf_w4_divide_from_inverse;
+    gf->inverse.w32 = gf_w4_matrix;
+  }
+
+  if (gf->divide.w32 == NULL) {
+    gf->divide.w32 = gf_w4_divide_from_inverse;
+    if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid;
+  }
+
+  if (gf->inverse.w32 == NULL)  gf->inverse.w32 = gf_w4_inverse_from_divide;
+
+  if (h->region_type == GF_REGION_CAUCHY) {
+    gf->multiply_region.w32 = gf_wgen_cauchy_region;
+    gf->extract_word.w32 = gf_wgen_extract_word;
+  }
+
+  if (gf->multiply_region.w32 == NULL) {
+    gf->multiply_region.w32 = gf_w4_multiply_region_from_single;
+  }
+
+  return 1;
+}
+
+/* Inline setup functions */
+
+uint8_t *gf_w4_get_mult_table(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_single_table_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  if (gf->multiply.w32 == gf_w4_single_table_multiply) {
+    std = (struct gf_single_table_data *) h->private;
+    return (uint8_t *) std->mult;
+  } 
+  return NULL;
+}
+    
+uint8_t *gf_w4_get_div_table(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_single_table_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  if (gf->multiply.w32 == gf_w4_single_table_multiply) {
+    std = (struct gf_single_table_data *) h->private;
+    return (uint8_t *) std->div;
+  } 
+  return NULL;
+}
+
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c
new file mode 100644
index 0000000..ba75d8c
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c
@@ -0,0 +1,2218 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w64.c
+ *
+ * Routines for 64-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w64.h"
+
+static
+inline
+gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
+{
+  return gf->divide.w64(gf, 1, a);
+}
+
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
+
+static
+inline
+gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  b = gf->inverse.w64(gf, b);
+  return gf->multiply.w64(gf, a, b);
+}
+
+static
+void
+gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  uint32_t i;
+  gf_val_64_t *s64;
+  gf_val_64_t *d64;
+
+  s64 = (gf_val_64_t *) src;
+  d64 = (gf_val_64_t *) dest;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
+      d64[i] ^= gf->multiply.w64(gf, val, s64[i]);
+    }
+  } else {
+    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
+      d64[i] = gf->multiply.w64(gf, val, s64[i]);
+    }
+  }
+}
+
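+/* The PCLMUL region kernels below process two 64-bit source words per
+   16-byte load: each half of the vector is carry-less multiplied by val,
+   reduced modulo the primitive polynomial with two further PCLMULs, and
+   the two reduced halves are recombined with _mm_unpacklo_epi64 before
+   being stored (and XORed with the destination in the xor variant). */
+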
+#if defined(INTEL_SSE4_PCLMUL) 
+static
+void
+gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  gf_val_64_t *s64, *d64, *top;
+  gf_region_data rd;
+
+  __m128i         a, b;
+  __m128i         result, r1;
+  __m128i         prim_poly;
+  __m128i         w;
+  __m128i         m1, m3, m4;
+  gf_internal_t * h = gf->scratch;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
+  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
+  m3 = _mm_slli_si128(m1, 8);
+  m4 = _mm_slli_si128(m3, 4);
+
+  s64 = (gf_val_64_t *) rd.s_start;
+  d64 = (gf_val_64_t *) rd.d_start;
+  top = (gf_val_64_t *) rd.d_top;
+
+  if (xor) {
+    while (d64 != top) {
+      a = _mm_load_si128((__m128i *) s64);  
+      result = _mm_clmulepi64_si128 (a, b, 1);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      result = _mm_unpacklo_epi64(result, r1);
+      
+      r1 = _mm_load_si128((__m128i *) d64);
+      result = _mm_xor_si128(r1, result);
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2;
+    }
+  } else {
+    while (d64 != top) {
+      
+      a = _mm_load_si128((__m128i *) s64);  
+      result = _mm_clmulepi64_si128 (a, b, 1);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      
+      result = _mm_unpacklo_epi64(result, r1);
+
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  gf_val_64_t *s64, *d64, *top;
+  gf_region_data rd;
+
+  __m128i         a, b;
+  __m128i         result, r1;
+  __m128i         prim_poly;
+  __m128i         w;
+  __m128i         m1, m3, m4;
+  gf_internal_t * h = gf->scratch;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
+  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
+  m3 = _mm_slli_si128(m1, 8);
+  m4 = _mm_slli_si128(m3, 4);
+
+  s64 = (gf_val_64_t *) rd.s_start;
+  d64 = (gf_val_64_t *) rd.d_start;
+  top = (gf_val_64_t *) rd.d_top;
+
+  if (xor) {
+    while (d64 != top) {
+      a = _mm_load_si128((__m128i *) s64);
+      result = _mm_clmulepi64_si128 (a, b, 1);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      result = _mm_unpacklo_epi64(result, r1);
+
+      r1 = _mm_load_si128((__m128i *) d64);
+      result = _mm_xor_si128(r1, result);
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2;
+    }
+  } else {
+    while (d64 != top) {
+      a = _mm_load_si128((__m128i *) s64);
+      result = _mm_clmulepi64_si128 (a, b, 1);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      result = _mm_unpacklo_epi64(result, r1);
+
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2; 
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
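+/* Extended Euclidean algorithm over GF(2)[x]: e_i tracks the remainders,
+   d_i their degrees and y_i the Bezout coefficients, so that when e_i
+   reaches 1, y_i is the inverse of b modulo the primitive polynomial. */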
+static
+inline
+gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b)
+{
+  gf_val_64_t e_i, e_im1, e_ip1;
+  gf_val_64_t d_i, d_im1, d_ip1;
+  gf_val_64_t y_i, y_im1, y_ip1;
+  gf_val_64_t c_i;
+  gf_val_64_t one = 1;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = 64;
+  for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (one << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      d_ip1--;
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
+   include it for completeness.  It does have the feature that it requires no
+   extra memory.  
+*/
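+
+/* The routine below works in two phases: the first loop forms the full
+   128-bit carry-less product of a and b in the pair (pl, pr), and the
+   second loop folds pl back into pr one bit at a time by XORing in
+   suitably shifted copies of the primitive polynomial. */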
+
+static
+inline
+gf_val_64_t
+gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+{
+  uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* Allen: set leading one of primitive polynomial */
+  
+  a = a64;
+  bl = 0;
+  br = b64;
+  one = 1;
+  lbit = (one << 63);
+
+  pl = 0; /* Allen: left side of product */
+  pr = 0; /* Allen: right side of product */
+
+  /* Allen: unlike the corresponding functions for smaller word sizes,
+   * this loop carries out the initial carryless multiply by
+   * shifting b itself rather than simply looking at successively
+   * higher shifts of b */
+  
+  for (i = 0; i < GF_FIELD_WIDTH; i++) {
+    if (a & (one << i)) {
+      pl ^= bl;
+      pr ^= br;
+    }
+
+    bl <<= 1;
+    if (br & lbit) bl ^= 1;
+    br <<= 1;
+  }
+
+  /* Allen: the name of the variable "one" is no longer descriptive at this point */
+  
+  one = lbit >> 1;
+  ppl = (h->prim_poly >> 2) | one;
+  ppr = (h->prim_poly << (GF_FIELD_WIDTH-2));
+  while (one != 0) {
+    if (pl & one) {
+      pl ^= ppl;
+      pr ^= ppr;
+    }
+    one >>= 1;
+    ppr >>= 1;
+    if (ppl & 1) ppr ^= lbit;
+    ppl >>= 1;
+  }
+  return pr;
+}
+
+/*
+ * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
+ */
+
+static
+inline
+gf_val_64_t
+gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+{
+       gf_val_64_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL) 
+
+        __m128i         a, b;
+        __m128i         result;
+        __m128i         prim_poly;
+        __m128i         v, w;
+        gf_internal_t * h = gf->scratch;
+
+        a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
+        b = _mm_insert_epi64 (a, b64, 0); 
+        prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+        /* Do the initial multiply */
+   
+        result = _mm_clmulepi64_si128 (a, b, 0);
+        
+        /* Mask off the high order 32 bits using subtraction of the polynomial.
+         * NOTE: this part requires that the polynomial have at least 32 leading 0 bits.
+         */
+
+        /* Adam: We can't include the leading one in the 64-bit pclmul,
+         so we need to split up the high 8 bytes of the result into two
+         parts before we multiply them with the prim_poly. */
+
+        v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+        w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+        result = _mm_xor_si128 (result, w);
+        v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+        w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+        result = _mm_xor_si128 (result, w);
+
+        rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
+#endif
+        return rv;
+}
+ 
+static
+inline
+gf_val_64_t
+gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+{
+  gf_val_64_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL) 
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         v, w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
+  b = _mm_insert_epi64 (a, b64, 0);
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+ 
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+
+  rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
+#endif
+  return rv;
+}
+
+
+void
+gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+#if defined(INTEL_SSE4_PCLMUL) 
+  gf_internal_t *h;
+  uint8_t *s8, *d8, *dtop;
+  gf_region_data rd;
+  __m128i  v, b, m, prim_poly, c, fr, w, result;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+  dtop = (uint8_t *) rd.d_top;
+
+  v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
+  m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  if (xor) {
+    while (d8 != dtop) {
+      b = _mm_load_si128((__m128i *) s8);
+      result = _mm_clmulepi64_si128 (b, v, 0);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      fr = _mm_xor_si128 (result, w);
+      fr = _mm_and_si128 (fr, m);
+
+      result = _mm_clmulepi64_si128 (b, v, 1);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      result = _mm_slli_si128 (result, 8);
+      fr = _mm_xor_si128 (result, fr);
+      result = _mm_load_si128((__m128i *) d8);
+      fr = _mm_xor_si128 (result, fr);
+
+      _mm_store_si128((__m128i *) d8, fr);
+      d8 += 16;
+      s8 += 16;
+    }
+  } else {
+    while (d8 < dtop) {
+      b = _mm_load_si128((__m128i *) s8);
+      result = _mm_clmulepi64_si128 (b, v, 0);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      fr = _mm_xor_si128 (result, w);
+      fr = _mm_and_si128 (fr, m);
+  
+      result = _mm_clmulepi64_si128 (b, v, 1);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      result = _mm_slli_si128 (result, 8);
+      fr = _mm_xor_si128 (result, fr);
+  
+      _mm_store_si128((__m128i *) d8, fr);
+      d8 += 16;
+      s8 += 16;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+#endif
+}
+
+void
+gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_4_64_lazy_data *ld;
+  int i, j, k;
+  uint64_t pp, v, s, *s64, *d64, *top;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 16; i++) {
+      ld->tables[i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+      }
+    }
+  }
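+
+  /* At this point tables[i][n] holds n * val * x^(4i) in GF(2^64), so the
+     main loop below multiplies each 64-bit word by val with one table
+     lookup and one XOR per nibble. */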
+  ld->last_value = val;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  while (d64 != top) {
+    v = (xor) ? *d64 : 0;
+    s = *s64;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&0xf];
+      s >>= 4;
+      i++;
+    }
+    *d64 = v;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
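+/* Split 8/8 multiplication: both operands are broken into bytes, and
+   d8->tables[i+j][x][y] (filled in by an init routine outside this hunk)
+   presumably holds the already-reduced product of bytes x and y placed at
+   bit offset 8*(i+j). */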
+static
+inline
+uint64_t
+gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64)
+{
+  uint64_t product, i, j, mask, tb;
+  gf_internal_t *h;
+  struct gf_split_8_8_data *d8;
+ 
+  h = (gf_internal_t *) gf->scratch;
+  d8 = (struct gf_split_8_8_data *) h->private;
+  product = 0;
+  mask = 0xff;
+
+  for (i = 0; a64 != 0; i++) {
+    tb = b64;
+    for (j = 0; tb != 0; j++) {
+      product ^= d8->tables[i+j][a64&mask][tb&mask];
+      tb >>= 8;
+    }
+    a64 >>= 8;
+  }
+  return product;
+}
+
+void
+gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_8_64_lazy_data *ld;
+  int i, j, k;
+  uint64_t pp, v, s, *s64, *d64, *top;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_8_64_lazy_data *) h->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 8; i++) {
+      ld->tables[i][0] = 0;
+      for (j = 1; j < 256; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+      }
+    }
+  }
+  ld->last_value = val;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  while (d64 != top) {
+    v = (xor) ? *d64 : 0;
+    s = *s64;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&0xff];
+      s >>= 8;
+      i++;
+    }
+    *d64 = v;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+void
+gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_16_64_lazy_data *ld;
+  int i, j, k;
+  uint64_t pp, v, s, *s64, *d64, *top;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_16_64_lazy_data *) h->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 4; i++) {
+      ld->tables[i][0] = 0;
+      for (j = 1; j < (1<<16); j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+      }
+    }
+  }
+  ld->last_value = val;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  while (d64 != top) {
+    v = (xor) ? *d64 : 0;
+    s = *s64;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&0xffff];
+      s >>= 16;
+      i++;
+    }
+    *d64 = v;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static 
+int gf_w64_shift_init(gf_t *gf)
+{
+  gf->multiply.w64 = gf_w64_shift_multiply;
+  gf->inverse.w64 = gf_w64_euclid;
+  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
+  return 1;
+}
+
+static 
+int gf_w64_cfm_init(gf_t *gf)
+{
+  gf->inverse.w64 = gf_w64_euclid;
+  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
+
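+  /* Choose a PCLMUL kernel based on how compact the primitive polynomial
+   * is: gf_w64_clm_multiply_2 (two reduction rounds) needs the non-leading
+   * terms to fit in the low 33 bits, while gf_w64_clm_multiply_4 (four
+   * reduction rounds) only needs them to fit in the low 49 bits. */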
+#if defined(INTEL_SSE4_PCLMUL) 
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if ((0xfffffffe00000000ULL & h->prim_poly) == 0) {
+    gf->multiply.w64 = gf_w64_clm_multiply_2;
+    gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2;
+  } else if ((0xfffe000000000000ULL & h->prim_poly) == 0) {
+    gf->multiply.w64 = gf_w64_clm_multiply_4;
+    gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
+  } else {
+    return 0;
+  }
+  return 1;
+#endif
+
+  return 0;
+}
+
+static
+void
+gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
+{
+  uint64_t i;
+  uint64_t j;
+  uint64_t one = 1;
+  int g_s;
+
+  g_s = h->arg1;
+  shift[0] = 0;
+ 
+  for (i = 1; i < ((uint64_t)1 << g_s); i <<= 1) {
+    for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
+    if (val & (one << 63)) {
+      val <<= 1;
+      val ^= h->prim_poly;
+    } else {
+      val <<= 1;
+    }
+  }
+}
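+
+/* After gf_w64_group_set_shift_tables(), shift[i] = i * val in GF(2^64)
+   for every i < 2^g_s: each power-of-two pass doubles val (multiplies it
+   by x modulo the primitive polynomial) and fills the new half of the
+   table as shift[i|j] = shift[j] ^ val. */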
+
+static
+inline
+gf_val_64_t
+gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  uint64_t top, bot, mask, tp;
+  int g_s, g_r, lshift, rshift;
+  struct gf_w64_group_data *gd;
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+  g_r = h->arg2;
+  gd = (struct gf_w64_group_data *) h->private;
+  gf_w64_group_set_shift_tables(gd->shift, b, h);
+
+  mask = (((uint64_t)1 << g_s) - 1);
+  top = 0;
+  bot = gd->shift[a&mask];
+  a >>= g_s; 
+
+  if (a == 0) return bot;
+  lshift = 0;
+  rshift = 64;
+
+  do {              /* Shifting out is straightforward */
+    lshift += g_s;
+    rshift -= g_s;
+    tp = gd->shift[a&mask];
+    top ^= (tp >> rshift);
+    bot ^= (tp << lshift);
+    a >>= g_s; 
+  } while (a != 0);
+
+  /* Reducing is a bit gross, because I don't zero out the index bits of top.
+     The reason is that we throw top away.  Even better, that last (tp >> rshift)
+     is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. */
+     
+  lshift = ((lshift-1) / g_r) * g_r;
+  rshift = 64 - lshift;
+  mask = ((uint64_t)1 << g_r) - 1;
+  while (lshift >= 0) {
+    tp = gd->reduce[(top >> lshift) & mask];
+    top ^= (tp >> rshift);
+    bot ^= (tp << lshift);
+    lshift -= g_r;
+    rshift += g_r;
+  }
+    
+  return bot;
+}
+
+static
+void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  int i, fzb;
+  uint64_t a64, smask, rmask, top, bot, tp;
+  int lshift, rshift, g_s, g_r;
+  gf_region_data rd;
+  uint64_t *s64, *d64, *dtop;
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gd = (struct gf_w64_group_data *) h->private;
+  g_s = h->arg1;
+  g_r = h->arg2;
+  gf_w64_group_set_shift_tables(gd->shift, val, h);
+
+  for (i = 63; !(val & (1ULL << i)); i--) ;
+  i += g_s;
+  
+  /* i is the bit position of the first zero bit in any element of
+     gd->shift[] */
+  
+  if (i > 64) i = 64;   
+  
+  fzb = i;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  dtop = (uint64_t *) rd.d_top;
+
+  smask = ((uint64_t)1 << g_s) - 1;
+  rmask = ((uint64_t)1 << g_r) - 1;
+
+  while (d64 < dtop) {
+    a64 = *s64;
+    
+    top = 0;
+    bot = gd->shift[a64&smask];
+    a64 >>= g_s;
+    i = fzb;
+
+    if (a64 != 0) {
+      lshift = 0;
+      rshift = 64;
+  
+      do {  
+        lshift += g_s;
+        rshift -= g_s;
+        tp = gd->shift[a64&smask];
+        top ^= (tp >> rshift);
+        bot ^= (tp << lshift);
+        a64 >>= g_s;
+      } while (a64 != 0);
+      i += lshift;
+  
+      lshift = ((i-64-1) / g_r) * g_r;
+      rshift = 64 - lshift;
+      while (lshift >= 0) {
+        tp = gd->reduce[(top >> lshift) & rmask];
+        top ^= (tp >> rshift);    
+        bot ^= (tp << lshift);
+        lshift -= g_r;
+        rshift += g_r;
+      }
+    }
+
+    if (xor) bot ^= *d64;
+    *d64 = bot;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+inline
+gf_val_64_t
+gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  int leftover, rs;
+  uint64_t p, l, ind, a64;
+  int bits_left;
+  int g_s;
+
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+
+  gd = (struct gf_w64_group_data *) h->private;
+  gf_w64_group_set_shift_tables(gd->shift, b, h);
+
+  leftover = 64 % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  rs = 64 - leftover;
+  a64 = a;
+  ind = a64 >> rs;
+  a64 <<= leftover;
+  p = gd->shift[ind];
+
+  bits_left = rs;
+  rs = 64 - g_s;
+
+  while (bits_left > 0) {
+    bits_left -= g_s;
+    ind = a64 >> rs;
+    a64 <<= g_s;
+    l = p >> rs;
+    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
+  }
+  return p;
+}
+
+static
+void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  int leftover, rs;
+  uint64_t p, l, ind, a64;
+  int bits_left;
+  int g_s;
+  gf_region_data rd;
+  uint64_t *s64, *d64, *top;
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gd = (struct gf_w64_group_data *) h->private;
+  g_s = h->arg1;
+  gf_w64_group_set_shift_tables(gd->shift, val, h);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  leftover = 64 % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  while (d64 < top) {
+    rs = 64 - leftover;
+    a64 = *s64;
+    ind = a64 >> rs;
+    a64 <<= leftover;
+    p = gd->shift[ind];
+
+    bits_left = rs;
+    rs = 64 - g_s;
+
+    while (bits_left > 0) {
+      bits_left -= g_s;
+      ind = a64 >> rs;
+      a64 <<= g_s;
+      l = p >> rs;
+      p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
+    }
+    if (xor) p ^= *d64;
+    *d64 = p;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+
+static
+int gf_w64_group_init(gf_t *gf)
+{
+  uint64_t i, j, p, index;
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint64_t g_r, g_s;
+
+  g_s = h->arg1;
+  g_r = h->arg2;
+
+  gd = (struct gf_w64_group_data *) h->private;
+  gd->shift = (uint64_t *) (&(gd->memory));
+  gd->reduce = gd->shift + (1 << g_s);
+
+  gd->reduce[0] = 0;
+  for (i = 0; i < ((uint64_t)1 << g_r); i++) {
+    p = 0;
+    index = 0;
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        p ^= (h->prim_poly << j);
+        index ^= (1 << j);
+        if (j > 0) index ^= (h->prim_poly >> (64-j)); 
+      }
+    }
+    gd->reduce[index] = p;
+  }
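+
+  /* reduce[] maps a g_r-bit overflow pattern, as it appears in the top
+     half of the 128-bit product, to the XOR of shifted copies of the
+     primitive polynomial that folds those bits back into the lower half;
+     the index is built to account for the bits that the reduction itself
+     flips inside the same window. */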
+
+  if (g_s == g_r) {
+    gf->multiply.w64 = gf_w64_group_s_equals_r_multiply;
+    gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region; 
+  } else {
+    gf->multiply.w64 = gf_w64_group_multiply;
+    gf->multiply_region.w64 = gf_w64_group_multiply_region; 
+  }
+  gf->divide.w64 = NULL;
+  gf->inverse.w64 = gf_w64_euclid;
+
+  return 1;
+}
+
+static
+gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint64_t *r64, rv;
+
+  r64 = (uint64_t *) start;
+  rv = r64[index];
+  return rv;
+}
+
+static
+gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int sub_size;
+  gf_internal_t *h;
+  uint8_t *r8, *top;
+  uint64_t a, b, *r64;
+  gf_region_data rd;
+
+  h = (gf_internal_t *) gf->scratch;
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  r64 = (uint64_t *) start;
+  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
+  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
+  index -= (((uint64_t *) rd.d_start) - r64);
+  r8 = (uint8_t *) rd.d_start;
+  top = (uint8_t *) rd.d_top;
+  sub_size = (top-r8)/2;
+
+  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
+  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
+  return (a | ((uint64_t)b << 32));
+}
+
+static
+gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int i;
+  uint64_t *r64, rv;
+  uint8_t *r8;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128);
+  r64 = (uint64_t *) start;
+  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
+  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
+  index -= (((uint64_t *) rd.d_start) - r64);
+  r8 = (uint8_t *) rd.d_start;
+  r8 += ((index & 0xfffffff0)*8);
+  r8 += (index & 0xf);
+  r8 += 112;
+  rv =0;
+  for (i = 0; i < 8; i++) {
+    rv <<= 8;
+    rv |= *r8;
+    r8 -= 16;
+  }
+  return rv;
+}
+
+static
+inline
+gf_val_64_t
+gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  uint64_t prod, pp, bmask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  bmask = 0x8000000000000000ULL;
+
+  while (1) {
+    if (a & 1) prod ^= b;
+    a >>= 1;
+    if (a == 0) return prod;
+    if (b & bmask) {
+      b = ((b << 1) ^ pp);
+    } else {
+      b <<= 1;
+    }
+  }
+}
+
+static
+inline
+gf_val_64_t
+gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  uint64_t prod, pp, pmask, amask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  
+  /* Changed from declare-then-shift to initializing the masks directly. */
+  
+  pmask = 0x8000000000000000ULL;
+  amask = 0x8000000000000000ULL;
+
+  while (amask != 0) {
+    if (prod & pmask) {
+      prod = ((prod << 1) ^ pp);
+    } else {
+      prod <<= 1;
+    }
+    if (a & amask) prod ^= b;
+    amask >>= 1;
+  }
+  return prod;
+}
+
+static
+void
+gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, ta, prod, amask, pmask, pp;
+  gf_region_data rd;
+  gf_internal_t *h;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  pmask = 0x80000000;
+  pmask <<= 32;
+  pp = h->prim_poly;
+
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = pmask;
+      ta = *s64;
+      while (amask != 0) {
+        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = pmask;
+      ta = *s64;
+      while (amask != 0) {
+        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, ta, tb, prod, bmask, pp;
+  gf_region_data rd;
+  gf_internal_t *h;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  bmask = 0x80000000;
+  bmask <<= 32;
+  pp = h->prim_poly;
+
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      tb = val;
+      ta = *s64;
+      while (1) {
+        if (tb & 1) prod ^= ta;
+        tb >>= 1;
+        if (tb == 0) break;
+        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      tb = val;
+      ta = *s64;
+      while (1) {
+        if (tb & 1) prod ^= ta;
+        tb >>= 1;
+        if (tb == 0) break;
+        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, m2); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi64(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
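+
+/* SSE_AB2 multiplies each 64-bit lane of va by x (i.e. by 2) in GF(2^64):
+   the lane is shifted left one bit and the (t2 << 1) - (t2 >> 63) trick
+   turns the old top bit into an all-ones mask that conditionally XORs in
+   the primitive polynomial.  BYTWO_P_ONESTEP doubles the running product
+   and then adds ta exactly for the set bits of val, processed MSB first:
+   in the routine below v holds the bit-reversed complement of val, so
+   (v & 1) - 1 is all ones precisely when the corresponding bit of val is
+   set. */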
+
+
+void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSE2
+  int i;
+  uint8_t *s8, *d8;
+  uint64_t vrev, one64;
+  uint64_t amask;
+  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
+  gf_region_data rd;
+  gf_internal_t *h;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+  one64 = 1;
+  vrev = 0;
+  for (i = 0; i < 64; i++) {
+    vrev <<= 1;
+    if (!(val & (one64 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  amask = -1;
+  amask ^= 1;
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(amask);
+  m2 = _mm_set1_epi64x(one64 << 63);
+  one = _mm_set1_epi64x(1);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi64x(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+#endif
+}
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
+{
+  uint64_t one64, amask;
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  gf_internal_t *h;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  h = (gf_internal_t *) rd->gf->scratch;
+  one64 = 1;
+  amask = -1;
+  amask ^= 1;
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(amask);
+  m2 = _mm_set1_epi64x(one64 << 63);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    vb = _mm_load_si128 ((__m128i *)(d8));
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
+{
+  uint64_t one64, amask;
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va;
+  gf_internal_t *h;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  h = (gf_internal_t *) rd->gf->scratch;
+  one64 = 1;
+  amask = -1;
+  amask ^= 1;
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(amask);
+  m2 = _mm_set1_epi64x(one64 << 63);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+void
+gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  uint64_t itb, amask, one64;
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  gf_region_data rd;
+  gf_internal_t *h;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  if (val == 2) {
+    if (xor) {
+      gf_w64_bytwo_b_sse_region_2_xor(&rd);
+    } else {
+      gf_w64_bytwo_b_sse_region_2_noxor(&rd);
+    }
+    gf_do_final_region_alignment(&rd);
+    return;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+  h = (gf_internal_t *) gf->scratch;
+
+  one64 = 1;
+  amask = -1;
+  amask ^= 1;
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(amask);
+  m2 = _mm_set1_epi64x(one64 << 63);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
+    itb = val;
+    while (1) {
+      if (itb & 1) vb = _mm_xor_si128(vb, va);
+      itb >>= 1;
+      if (itb == 0) break;
+      SSE_AB2(pp, m1, m2, va, t1, t2);
+    }
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+
+static
+int gf_w64_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    gf->multiply.w64 = gf_w64_bytwo_p_multiply;
+    #ifdef INTEL_SSE2 
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; 
+      else
+        gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; 
+    #else
+      gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; 
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  } else {
+    gf->multiply.w64 = gf_w64_bytwo_b_multiply;
+    #ifdef INTEL_SSE2 
+      if (h->region_type & GF_REGION_NOSIMD)
+        gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; 
+      else
+        gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; 
+    #else
+      gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; 
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+    #endif
+  }
+  gf->inverse.w64 = gf_w64_euclid;
+  return 1;
+}
+
+
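+/* Composite multiplication treats a 64-bit element as a1*x + a0 over
+   GF(2^32), working modulo x^2 + s*x + 1 with s = h->prim_poly.  Since
+   x^2 = s*x + 1, the product (a1*x + a0)*(b1*x + b0) has low word
+   a0*b0 + a1*b1 and high word a1*b0 + a0*b1 + s*a1*b1, which is what the
+   routine below computes with five base-field multiplications. */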
+static
+gf_val_64_t
+gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t b0 = b & 0x00000000ffffffff;
+  uint32_t b1 = (b & 0xffffffff00000000) >> 32;
+  uint32_t a0 = a & 0x00000000ffffffff;
+  uint32_t a1 = (a & 0xffffffff00000000) >> 32;
+  uint32_t a1b1;
+
+  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+  return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | 
+         ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report)
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
+ *
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ *
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ *
+ * a / b = a * c
+ */
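+
+/*
+ * Filling in the middle step: substituting c1 = d*b1^-1 and c0 = (d+1)*b0^-1
+ * into s*b1c1 + b1c0 + b0c1 = 0 gives
+ *
+ *   s*d + (d+1)*b1*b0^-1 + d*b0*b1^-1 = 0
+ *   d*(s + b1*b0^-1 + b0*b1^-1) = b1*b0^-1
+ *
+ * which rearranges (in characteristic 2) to the solution quoted above.
+ */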
+
+static
+gf_val_64_t
+gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t a0 = a & 0x00000000ffffffff;
+  uint32_t a1 = (a & 0xffffffff00000000) >> 32;
+  uint32_t c0, c1, d, tmp;
+  uint64_t c;
+  uint32_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w32(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    a0inv = base_gf->inverse.w32(base_gf, a0);
+
+    d = base_gf->multiply.w32(base_gf, a1, a0inv);
+
+    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
+    tmp = base_gf->inverse.w32(base_gf, tmp);
+
+    d = base_gf->multiply.w32(base_gf, d, tmp);
+
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
+  }
+
+  c = c0 | ((uint64_t)c1 << 32);
+
+  return c;
+}
+
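+/* Minimal sanity-check sketch for the inverse above (illustrative only;
+   assumes a gf_t that was initialized for w=64 with GF_MULT_COMPOSITE,
+   e.g. via gf_init_hard(), and a nonzero element b):
+
+     uint64_t c = gf.inverse.w64(&gf, b);
+     assert(gf.multiply.w64(&gf, b, c) == 1);
+ */
+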
+static
+void
+gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t b0 = val & 0x00000000ffffffff;
+  uint32_t b1 = (val & 0xffffffff00000000) >> 32;
+  uint64_t *s64, *d64;
+  uint64_t *top;
+  uint64_t a0, a1, a1b1;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+
+  s64 = rd.s_start;
+  d64 = rd.d_start;
+  top = rd.d_top;
+  
+  if (xor) {
+    while (d64 < top) {
+      a0 = *s64 & 0x00000000ffffffff;
+      a1 = (*s64 & 0xffffffff00000000) >> 32;
+      a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+      *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
+      s64++;
+      d64++;
+    }
+  } else {
+    while (d64 < top) {
+      a0 = *s64 & 0x00000000ffffffff;
+      a1 = (*s64 & 0xffffffff00000000) >> 32;
+      a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+      *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
+      s64++;
+      d64++;
+    }
+  }
+}
+
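+/* ALTMAP layout: the low 32-bit halves of the 64-bit words occupy the first
+   half of the region and the high halves the second half, so the composite
+   product is formed with five base-field region multiplies:
+
+     dlow  += slow*b0 + shigh*b1                    (a0*b0 + a1*b1)
+     dhigh += slow*b1 + shigh*b0 + shigh*(s*b1)     (a1*b0 + a0*b1 + s*a1*b1)
+ */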
+static
+void
+gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  gf_val_32_t val0 = val & 0x00000000ffffffff;
+  gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32;
+  uint8_t *slow, *shigh;
+  uint8_t *dlow, *dhigh, *top;
+  int sub_reg_size;
+  gf_region_data rd;
+
+  if (!xor) {
+    memset(dest, 0, bytes);
+  }
+  
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  slow = (uint8_t *) rd.s_start;
+  dlow = (uint8_t *) rd.d_start;
+  top = (uint8_t*) rd.d_top;
+  sub_reg_size = (top - dlow)/2;
+  shigh = slow + sub_reg_size;
+  dhigh = dlow + sub_reg_size;
+
+  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+
+
+static
+int gf_w64_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt;
+  } else {
+    gf->multiply_region.w64 = gf_w64_composite_multiply_region;
+  }
+
+  gf->multiply.w64 = gf_w64_composite_multiply;
+  gf->divide.w64 = NULL;
+  gf->inverse.w64 = gf_w64_composite_inverse;
+
+  return 1;
+}
+
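+/* 4-bit split tables with PSHUFB: each 64-bit word is processed as sixteen
+   4-bit nibbles.  For every nibble position a 16-entry table of 64-bit
+   products of val is built and stored as eight byte planes, and
+   _mm_shuffle_epi8 then performs sixteen table lookups per plane at once.
+   The ALTMAP routine below assumes the data is already laid out in byte
+   planes; the non-ALTMAP SSE routine further down packs and unpacks around
+   the same core. */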
+#ifdef INTEL_SSSE3
+static
+  void
+gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v, *s64, *d64, *top;
+  __m128i si, tables[16][8], p[8], v0, mask1;
+  struct gf_split_4_64_lazy_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+ 
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  v = val;
+  for (i = 0; i < 16; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+
+  mask1 = _mm_set1_epi8(0xf);
+
+  while (d64 != top) {
+
+    if (xor) {
+      for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
+    } else {
+      for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
+    }
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      v0 = _mm_load_si128((__m128i *) s64); 
+      /* MM_PRINT8("v", v0); */
+      s64 += 2;
+      
+      si = _mm_and_si128(v0, mask1);
+  
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+    }
+    for (i = 0; i < 8; i++) {
+      /* MM_PRINT8("v", p[i]); */
+      _mm_store_si128((__m128i *) d64, p[i]);
+      d64 += 2;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#ifdef INTEL_SSE4
+static
+  void
+gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v, *s64, *d64, *top;
+  __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1;
+  struct gf_split_4_64_lazy_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+ 
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  v = val;
+  for (i = 0; i < 16; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+
+  mask1 = _mm_set1_epi8(0xf);
+  mask8 = _mm_set1_epi16(0xff);
+  mask16 = _mm_set1_epi32(0xffff);
+
+  while (d64 != top) {
+
+    for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
+
+    for (k = 0; k < 8; k++) {
+      st[k]  = _mm_load_si128((__m128i *) s64); 
+      s64 += 2;
+    }
+
+    for (k = 0; k < 4; k ++) {
+      st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0));
+      st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1));
+      t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0);
+      st[k] = _mm_srli_si128(st[k], 8);
+      st[k+4] = _mm_slli_si128(st[k+4], 8);
+      st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0);
+      st[k] = t1;
+    }
+
+/*
+    printf("After pack pass 1\n");
+    for (k = 0; k < 8; k++) {
+      MM_PRINT8("v", st[k]);
+    }
+    printf("\n");
+ */
+    
+    t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16));
+    st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16));
+    st[0] = t1;
+    t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16));
+    st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16));
+    st[1] = t1;
+    t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16));
+    st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16));
+    st[4] = t1;
+    t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16));
+    st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16));
+    st[5] = t1;
+
+/*
+    printf("After pack pass 2\n");
+    for (k = 0; k < 8; k++) {
+      MM_PRINT8("v", st[k]);
+    }
+    printf("\n");
+ */
+    t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8));
+    st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8));
+    st[0] = t1;
+    t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8));
+    st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8));
+    st[2] = t1;
+    t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8));
+    st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8));
+    st[4] = t1;
+    t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8));
+    st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8));
+    st[6] = t1;
+
+/*
+    printf("After final pack pass 2\n");
+    for (k = 0; k < 8; k++) {
+      MM_PRINT8("v", st[k]);
+    }
+ */
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      si = _mm_and_si128(st[k], mask1);
+  
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+      st[k] = _mm_srli_epi32(st[k], 4);
+      si = _mm_and_si128(st[k], mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+    }
+
+    t1 = _mm_unpacklo_epi8(p[0], p[1]);
+    p[1] = _mm_unpackhi_epi8(p[0], p[1]);
+    p[0] = t1;
+    t1 = _mm_unpacklo_epi8(p[2], p[3]);
+    p[3] = _mm_unpackhi_epi8(p[2], p[3]);
+    p[2] = t1;
+    t1 = _mm_unpacklo_epi8(p[4], p[5]);
+    p[5] = _mm_unpackhi_epi8(p[4], p[5]);
+    p[4] = t1;
+    t1 = _mm_unpacklo_epi8(p[6], p[7]);
+    p[7] = _mm_unpackhi_epi8(p[6], p[7]);
+    p[6] = t1;
+
+/*
+    printf("After unpack pass 1:\n");
+    for (i = 0; i < 8; i++) {
+      MM_PRINT8("v", p[i]);
+    }
+ */
+
+    t1 = _mm_unpacklo_epi16(p[0], p[2]);
+    p[2] = _mm_unpackhi_epi16(p[0], p[2]);
+    p[0] = t1;
+    t1 = _mm_unpacklo_epi16(p[1], p[3]);
+    p[3] = _mm_unpackhi_epi16(p[1], p[3]);
+    p[1] = t1;
+    t1 = _mm_unpacklo_epi16(p[4], p[6]);
+    p[6] = _mm_unpackhi_epi16(p[4], p[6]);
+    p[4] = t1;
+    t1 = _mm_unpacklo_epi16(p[5], p[7]);
+    p[7] = _mm_unpackhi_epi16(p[5], p[7]);
+    p[5] = t1;
+
+/*
+    printf("After unpack pass 2:\n");
+    for (i = 0; i < 8; i++) {
+      MM_PRINT8("v", p[i]);
+    }
+ */
+
+    t1 = _mm_unpacklo_epi32(p[0], p[4]);
+    p[4] = _mm_unpackhi_epi32(p[0], p[4]);
+    p[0] = t1;
+    t1 = _mm_unpacklo_epi32(p[1], p[5]);
+    p[5] = _mm_unpackhi_epi32(p[1], p[5]);
+    p[1] = t1;
+    t1 = _mm_unpacklo_epi32(p[2], p[6]);
+    p[6] = _mm_unpackhi_epi32(p[2], p[6]);
+    p[2] = t1;
+    t1 = _mm_unpacklo_epi32(p[3], p[7]);
+    p[7] = _mm_unpackhi_epi32(p[3], p[7]);
+    p[3] = t1;
+
+    if (xor) {
+      for (i = 0; i < 8; i++) {
+        t1 = _mm_load_si128((__m128i *) d64);
+        _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1));
+        d64 += 2;
+      }
+    } else {
+      for (i = 0; i < 8; i++) {
+        _mm_store_si128((__m128i *) d64, p[i]);
+        d64 += 2;
+      }
+    }
+
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
+
+static
+int gf_w64_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_split_4_64_lazy_data *d4;
+  struct gf_split_8_64_lazy_data *d8;
+  struct gf_split_8_8_data *d88;
+  struct gf_split_16_64_lazy_data *d16;
+  uint64_t p, basep;
+  int exp, i, j;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Defaults */
+
+  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
+
+  gf->multiply.w64 = gf_w64_bytwo_p_multiply; 
+
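+  /* With PCLMUL available, prefer a carry-less multiply.  The masks below
+     test how many zero bits follow the implicit leading 1 of prim_poly,
+     which determines whether 2 or 4 reduction rounds are needed. */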
+#if defined(INTEL_SSE4_PCLMUL) 
+  if ((!(h->region_type & GF_REGION_NOSIMD) &&
+     (h->arg1 == 64 || h->arg2 == 64)) ||
+     h->mult_type == GF_MULT_DEFAULT){
+   
+    if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ 
+      gf->multiply.w64 = gf_w64_clm_multiply_2;
+      gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; 
+    }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+      gf->multiply.w64 = gf_w64_clm_multiply_4;
+      gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; 
+    }else{
+      return 0;
+    }
+  }
+#endif
+
+  gf->inverse.w64 = gf_w64_euclid;
+
+  /* Allen: set region pointers for default mult type. Single pointers are
+   * taken care of above (explicitly for sse, implicitly for no sse). */
+
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    d4 = (struct gf_split_4_64_lazy_data *) h->private;
+    d4->last_value = 0;
+#if defined(INTEL_SSE4)
+    gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; 
+#elif defined(ARCH_AARCH64)
+    gf_w64_neon_split_init(gf);
+#endif
+  }
+#else
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    d8 = (struct gf_split_8_64_lazy_data *) h->private;
+    d8->last_value = 0;
+    gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
+  }
+#endif
+
+  if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
+    d4 = (struct gf_split_4_64_lazy_data *) h->private;
+    d4->last_value = 0;
+
+    if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0;
+    if(h->region_type & GF_REGION_ALTMAP)
+    {
+      #ifdef INTEL_SSSE3
+        gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; 
+      #elif defined(ARCH_AARCH64)
+        gf_w64_neon_split_init(gf);
+      #else
+        return 0;
+      #endif
+    }
+    else //no altmap
+    {
+      #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+        if(h->region_type & GF_REGION_NOSIMD)
+          gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
+        else
+        #if defined(INTEL_SSE4)
+          gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+        #elif defined(ARCH_AARCH64)
+          gf_w64_neon_split_init(gf);
+        #endif
+      #else
+        gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
+        if(h->region_type & GF_REGION_SIMD)
+          return 0;
+      #endif
+    }
+  }
+  if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) {
+    d8 = (struct gf_split_8_64_lazy_data *) h->private;
+    d8->last_value = 0;
+    gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
+  }
+  if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) {
+    d16 = (struct gf_split_16_64_lazy_data *) h->private;
+    d16->last_value = 0;
+    gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region;
+  }
+  if ((h->arg1 == 8 && h->arg2 == 8)) {
+    d88 = (struct gf_split_8_8_data *) h->private;
+    gf->multiply.w64 = gf_w64_split_8_8_multiply;
+
+    /* The performance of this guy sucks, so don't bother with a region op */
+    
+    basep = 1;
+    for (exp = 0; exp < 15; exp++) {
+      for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0;
+      for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0;
+      d88->tables[exp][1][1] = basep;
+      for (i = 2; i < 256; i++) {
+        if (i&1) {
+          p = d88->tables[exp][i^1][1];
+          d88->tables[exp][i][1] = p ^ basep;
+        } else {
+          p = d88->tables[exp][i>>1][1];
+          d88->tables[exp][i][1] = GF_MULTBY_TWO(p);
+        }
+      }
+      for (i = 1; i < 256; i++) {
+        p = d88->tables[exp][i][1];
+        for (j = 1; j < 256; j++) {
+          if (j&1) {
+            d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p;
+          } else {
+            d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]);
+          }
+        }
+      }
+      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
+    }
+  }
+  return 1;
+}
+
+int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  switch(mult_type)
+  {
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_CARRY_FREE:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t);
+      break;
+
+    case GF_MULT_DEFAULT:
+
+      /* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
+       * then fall through to split table scratch size code. */
+
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+      arg1 = 64;
+      arg2 = 4;
+#else
+      arg1 = 64;
+      arg2 = 8;
+#endif
+
+    case GF_MULT_SPLIT_TABLE:
+        if (arg1 == 8 && arg2 == 8) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
+        }
+        if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64;
+        }
+        if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64;
+        }
+
+        if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
+        }
+        return 0;
+    case GF_MULT_GROUP:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) +
+               sizeof(uint64_t) * (1 << arg1) +
+               sizeof(uint64_t) * (1 << arg2) + 64;
+      break;
+    case GF_MULT_COMPOSITE:
+      if (arg1 == 2) return sizeof(gf_internal_t) + 64;
+      return 0;
+      break;
+    default:
+      return 0;
+   }
+}
+
+int gf_w64_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  int no_default_flag = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  /* Omitting the leftmost 1 as in w=32 */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+    } else {
+      h->prim_poly = 0x1b;
+    } 
+    if (no_default_flag == 1) { 
+      fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); 
+      return 0; 
+    } 
+  }
+
+  gf->multiply.w64 = NULL;
+  gf->divide.w64 = NULL;
+  gf->inverse.w64 = NULL;
+  gf->multiply_region.w64 = NULL;
+
+  switch(h->mult_type) {
+    case GF_MULT_CARRY_FREE:  if (gf_w64_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:       if (gf_w64_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:   if (gf_w64_composite_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT:
+    case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; 
+    case GF_MULT_GROUP:       if (gf_w64_group_init(gf) == 0) return 0; break; 
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:     if (gf_w64_bytwo_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    gf->divide.w64 = gf_w64_divide_from_inverse;
+    gf->inverse.w64 = gf_w64_euclid;
+  } 
+
+  if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) {
+    gf->divide.w64 = gf_w64_divide_from_inverse;
+  }
+  if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) {
+    gf->inverse.w64 = gf_w64_inverse_from_divide;
+  }
+
+  if (h->region_type == GF_REGION_CAUCHY) return 0;
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      gf->extract_word.w64 = gf_w64_composite_extract_word;
+    } else if (h->mult_type == GF_MULT_SPLIT_TABLE) {
+      gf->extract_word.w64 = gf_w64_split_extract_word;
+    }
+  } else {
+    gf->extract_word.w64 = gf_w64_extract_word;
+  }
+
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w8.c b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c
new file mode 100644
index 0000000..276799f
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c
@@ -0,0 +1,2392 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w8.c
+ *
+ * Routines for 8-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include "gf_w8.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define AB2(ip, am1 ,am2, b, t1, t2) {\
+  t1 = (b << 1) & am1;\
+  t2 = b & am2; \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
+  b = (t1 ^ (t2 & ip));}
+
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, m2); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+
+#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf("  %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }
+
+static
+inline
+uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a)
+{
+  return gf->divide.w32(gf, 1, a);
+}
+
+static
+inline
+uint32_t gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
+{
+  b = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b);
+}
+
+static
+inline
+uint32_t gf_w8_euclid (gf_t *gf, uint32_t b)
+{
+  uint32_t e_i, e_im1, e_ip1;
+  uint32_t d_i, d_im1, d_ip1;
+  uint32_t y_i, y_im1, y_ip1;
+  uint32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = 8;
+  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+static
+gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint8_t *r8;
+
+  r8 = (uint8_t *) start;
+  return r8[index];
+}
+
+static
+gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int sub_size;
+  gf_internal_t *h;
+  uint8_t *r8, *top;
+  uint8_t a, b;
+  gf_region_data rd;
+
+  h = (gf_internal_t *) gf->scratch;
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  r8 = (uint8_t *) start;
+  if (r8 + index < (uint8_t *) rd.d_start) return r8[index];
+  if (r8 + index >= (uint8_t *) rd.d_top) return r8[index];
+  index -= (((uint8_t *) rd.d_start) - r8);
+  r8 = (uint8_t *) rd.d_start;
+  top = (uint8_t *) rd.d_top;
+  sub_size = (top-r8)/2;
+
+  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
+  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
+  return (a | (b << 4));
+}
+
+static
+inline
+uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
+{
+  return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly);
+}
+
+
+static
+inline
+gf_val_32_t
+gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+  b = _mm_insert_epi32 (a, b8, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+  /* Do the initial multiply */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben: Do the prim_poly reduction twice.  We are guaranteed to need the
+     reduction at most twice, because (w-2)/z == 2, where z is the number of
+     zeros after the leading 1 of the polynomial.
+
+     _mm_clmulepi64_si128 is the carry-less multiply operation.  Here
+     _mm_srli_si128 shifts the result right by 1 byte, which lets us multiply
+     prim_poly by the leading bits of the result; that product is then xored
+     back into the result. */
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+#endif
+  return rv;
+}
+
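+/* gf_w8_clm_multiply_2/_3/_4 differ only in the number of reduction rounds
+   after the carry-less multiply; gf_w8_cfm_init() picks among them based on
+   how many zero bits follow the leading 1 of prim_poly. */
+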
+static
+inline
+gf_val_32_t
+gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+  b = _mm_insert_epi32 (a, b8, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+#endif
+  return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+  gf_val_32_t rv = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+  b = _mm_insert_epi32 (a, b8, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+#endif
+  return rv;
+}
+
+
+static
+void
+gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+    xor)
+{
+  gf_region_data rd;
+  uint8_t *s8;
+  uint8_t *d8;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  if (xor) {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      *d8 ^= gf->multiply.w32(gf, val, *s8);
+      d8++;
+      s8++;
+    }
+  } else {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      *d8 = gf->multiply.w32(gf, val, *s8);
+      d8++;
+      s8++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+    xor)
+{
+  gf_region_data rd;
+  uint8_t *s8;
+  uint8_t *d8;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  if (xor) {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d8++;
+      s8++;
+    }
+  } else {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d8++;
+      s8++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+    xor)
+{
+  gf_region_data rd;
+  uint8_t *s8;
+  uint8_t *d8;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  if (xor) {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d8++;
+      s8++;
+    }
+  } else {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d8++;
+      s8++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+    xor)
+{
+  gf_region_data rd;
+  uint8_t *s8;
+  uint8_t *d8;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  if (xor) {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d8++;
+      s8++;
+    }
+  } else {
+    while (d8 < ((uint8_t *) rd.d_top)) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+      result = _mm_xor_si128 (result, w);
+      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+      d8++;
+      s8++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: SHIFT:
+
+JSP: The world's dumbest multiplication algorithm.  I only
+include it for completeness.  It does have the feature that it requires no
+extra memory.  
+ */
+
+static
+inline
+  uint32_t
+gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8)
+{
+  uint16_t product, i, pp, a, b;
+  gf_internal_t *h;
+
+  a = a8;
+  b = b8;
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  product = 0;
+
+  for (i = 0; i < GF_FIELD_WIDTH; i++) { 
+    if (a & (1 << i)) product ^= (b << i);
+  }
+  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
+    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); 
+  }
+  return product;
+}
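+
+/* Example (taking prim_poly = 0x11d): gf_w8_shift_multiply(gf, 2, 0x87)
+   first forms the carry-less product 0x87 << 1 = 0x10e; the reduction loop
+   then sees bit 8 set and xors in 0x11d, giving 0x10e ^ 0x11d = 0x13. */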
+
+static 
+int gf_w8_cfm_init(gf_t *gf)
+{ 
+#if defined(INTEL_SSE4_PCLMUL)
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+    if ((0xe0 & h->prim_poly) == 0){
+      gf->multiply.w32 = gf_w8_clm_multiply_2;
+      gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2;
+    }else if ((0xc0 & h->prim_poly) == 0){
+      gf->multiply.w32 = gf_w8_clm_multiply_3;
+      gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3;
+    }else if ((0x80 & h->prim_poly) == 0){ 
+      gf->multiply.w32 = gf_w8_clm_multiply_4;
+      gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4;
+    }else{
+      return 0;
+    }
+  return 1;
+#elif defined(ARM_NEON)
+  return gf_w8_neon_cfm_init(gf);
+#endif
+
+  return 0;
+
+}
+
+static 
+int gf_w8_shift_init(gf_t *gf)
+{ 
+  gf->multiply.w32 = gf_w8_shift_multiply;  /* The others will be set automatically */
+  return 1;
+}
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: LOG_TABLE:
+
+JSP: Kevin wrote this, and I'm converting it to my structure.
+*/
+
+static
+inline
+  uint32_t
+gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b)
+{
+  struct gf_w8_logzero_table_data *ltd;
+
+  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
+}
+
+static
+inline
+  uint32_t
+gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b)
+{
+  struct gf_w8_logzero_table_data *ltd;
+
+  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return ltd->div_tbl[ltd->log_tbl[a] - ltd->log_tbl[b]];
+}
+
+static
+inline
+  uint32_t
+gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b)
+{
+  struct gf_w8_logzero_small_table_data *std;
+
+  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  if (b == 0) return 0;
+  return std->antilog_tbl[std->log_tbl[a] + std->log_tbl[b]];
+}
+
+static
+inline
+  uint32_t
+gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b)
+{
+  struct gf_w8_logzero_small_table_data *std;
+
+  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return std->div_tbl[std->log_tbl[a] - std->log_tbl[b]];
+}
+
+static
+inline
+  uint32_t
+gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b)
+{
+  struct gf_w8_logtable_data *ltd;
+
+  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
+}
+
+static
+inline
+  uint32_t
+gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b)
+{
+  int log_sum = 0;
+  struct gf_w8_logtable_data *ltd;
+
+  if (a == 0 || b == 0) return 0;
+  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
+  return (ltd->antilog_tbl[log_sum]);
+}
+
+static
+  uint32_t
+gf_w8_log_inverse (gf_t *gf, uint32_t a)
+{
+  struct gf_w8_logtable_data *ltd;
+
+  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ltd->inv_tbl[a]);
+}
+
+static
+  uint32_t
+gf_w8_logzero_inverse (gf_t *gf, uint32_t a)
+{
+  struct gf_w8_logzero_table_data *ltd;
+
+  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ltd->inv_tbl[a]);
+}
+
+static
+  uint32_t
+gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a)
+{
+  struct gf_w8_logzero_small_table_data *std;
+
+  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (std->inv_tbl[a]);
+}
+
+static
+  void
+gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t lv;
+  uint8_t *s8, *d8;
+  struct gf_w8_logtable_data *ltd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  lv = ltd->log_tbl[val];
+
+  if (xor) {
+    for (i = 0; i < bytes; i++) {
+      d8[i] ^= (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
+    }
+  } else {
+    for (i = 0; i < bytes; i++) {
+      d8[i] = (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
+    }
+  }
+}
+
+static
+  void
+gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t lv;
+  uint8_t *s8, *d8;
+  struct gf_w8_logzero_table_data *ltd;
+  struct gf_w8_logzero_small_table_data *std;
+  short *log;
+  uint8_t *alt;
+  gf_internal_t *h;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if (h->arg1 == 1) {
+    std = (struct gf_w8_logzero_small_table_data *) h->private;
+    log = std->log_tbl;
+    alt = std->antilog_tbl;
+  } else {
+    ltd = (struct gf_w8_logzero_table_data *) h->private;
+    log = ltd->log_tbl;
+    alt = ltd->antilog_tbl;
+  }
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  lv = log[val];
+
+  if (xor) {
+    for (i = 0; i < bytes; i++) {
+      d8[i] ^= (alt[lv + log[s8[i]]]);
+    }
+  } else {
+    for (i = 0; i < bytes; i++) {
+      d8[i] = (alt[lv + log[s8[i]]]);
+    }
+  }
+}
+
+  static
+int gf_w8_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w8_logtable_data *ltd = NULL;
+  struct gf_w8_logzero_table_data *ztd = NULL;
+  struct gf_w8_logzero_small_table_data *std = NULL;
+  uint8_t *alt;
+  uint8_t *inv;
+  int i, b;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->mult_type == GF_MULT_LOG_TABLE) {
+    ltd = h->private;
+    alt = ltd->antilog_tbl;
+    inv = ltd->inv_tbl;
+  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+    std = h->private;
+    alt = std->antilog_tbl;
+    std->div_tbl = (alt + 255);
+    inv = std->inv_tbl;
+  } else {
+    ztd = h->private;
+    alt = ztd->antilog_tbl;
+    ztd->inv_tbl = (alt + 512 + 256);
+    ztd->div_tbl = (alt + 255);
+    inv = ztd->inv_tbl;
+  }
+
+  for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) {
+    if (h->mult_type == GF_MULT_LOG_TABLE)
+      ltd->log_tbl[i] = 0;
+    else if (h->mult_type == GF_MULT_LOG_ZERO)
+      std->log_tbl[i] = 0;
+    else
+      ztd->log_tbl[i] = 0;
+  }
+
+  if (h->mult_type == GF_MULT_LOG_TABLE) {
+    ltd->log_tbl[0] = 0;
+  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+    std->log_tbl[0] = 510;
+  } else {
+    ztd->log_tbl[0] = 512;
+  }
+
+  b = 1;
+  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
+    if (h->mult_type == GF_MULT_LOG_TABLE) {
+      if (ltd->log_tbl[b] != 0) check = 1;
+      ltd->log_tbl[b] = i;
+    } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+      if (std->log_tbl[b] != 0) check = 1;
+      std->log_tbl[b] = i;
+    } else {
+      if (ztd->log_tbl[b] != 0) check = 1;
+      ztd->log_tbl[b] = i;
+    }
+    alt[i] = b;
+    alt[i+GF_MULT_GROUP_SIZE] = b;
+    b <<= 1;
+    if (b & GF_FIELD_SIZE) {
+      b = b ^ h->prim_poly;
+    }
+  }
+  if (check) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255);
+
+  if (h->mult_type == GF_MULT_LOG_ZERO_EXT) {
+    bzero(alt+512, 255);
+    alt[512+512] = 0;
+  }
+
+  inv[0] = 0;  /* Not really, but we need to fill it with something  */
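+  /* i walks through the powers of x (the generator used to build the log
+     table above): after k steps i == x^k and b == GF_MULT_GROUP_SIZE - k,
+     so alt[b] == x^(255-k) is the multiplicative inverse of i. */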
+  i = 1;
+  b = GF_MULT_GROUP_SIZE;
+  do {
+    inv[i] = alt[b];
+    i <<= 1;
+    if (i & (1 << 8)) i ^= h->prim_poly;
+    b--;
+  } while (i != 1);
+
+  if (h->mult_type == GF_MULT_LOG_TABLE) {
+    gf->inverse.w32 = gf_w8_log_inverse;
+    gf->divide.w32 = gf_w8_log_divide;
+    gf->multiply.w32 = gf_w8_log_multiply;
+    gf->multiply_region.w32 = gf_w8_log_multiply_region;
+  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+    gf->inverse.w32 = gf_w8_logzero_small_inverse;
+    gf->divide.w32 = gf_w8_logzero_small_divide;
+    gf->multiply.w32 = gf_w8_logzero_small_multiply;
+    gf->multiply_region.w32 = gf_w8_logzero_multiply_region;
+  } else {
+    gf->inverse.w32 = gf_w8_logzero_inverse;
+    gf->divide.w32 = gf_w8_logzero_divide;
+    gf->multiply.w32 = gf_w8_logzero_multiply;
+    gf->multiply_region.w32 = gf_w8_logzero_multiply_region;
+  }
+  return 1;
+}
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: FULL_TABLE:
+
+JSP: Kevin wrote this, and I'm converting it to my structure.
+ */
+
+static
+  gf_val_32_t
+gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_single_table_data *ftd;
+
+  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->multtable[a][b]);
+}
+
+static
+  gf_val_32_t
+gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_single_table_data *ftd;
+
+  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->divtable[a][b]);
+}
+
+static
+  gf_val_32_t
+gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_default_data *ftd;
+
+  ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->multtable[a][b]);
+}
+
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+static
+  gf_val_32_t
+gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_default_data *ftd;
+
+  ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->divtable[a][b]);
+}
+#endif
+
+static
+  gf_val_32_t
+gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_double_table_data *ftd;
+
+  ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->mult[a][b]);
+}
+
+static
+  gf_val_32_t
+gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_double_table_data *ftd;
+
+  ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->div[a][b]);
+}
+
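+/* Region multiply through a 64K-entry table indexed by two source bytes at
+   a time (consumed by gf_two_byte_region_table_multiply()).  The lazy
+   variant rebuilds that table for the current val from the one-byte table
+   smult; the non-lazy variant uses the precomputed row dtd->mult[val]. */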
+static
+  void
+gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint16_t *base;
+  uint32_t b, c, vc, vb;
+  gf_internal_t *h;
+  struct gf_w8_double_table_data  *dtd;
+  struct gf_w8_double_table_lazy_data  *ltd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) (gf->scratch);
+  if (h->region_type & GF_REGION_LAZY) {
+    ltd = (struct gf_w8_double_table_lazy_data *) h->private;
+    base = ltd->mult;
+    for (b = 0; b < GF_FIELD_SIZE; b++) {
+      vb = (ltd->smult[val][b] << 8);
+      for (c = 0; c < GF_FIELD_SIZE; c++) {
+        vc = ltd->smult[val][c];
+        base[(b << 8)| c] = (vb | vc);
+      }
+    }
+
+  } else {
+    dtd = (struct gf_w8_double_table_data *) h->private;
+    base = &(dtd->mult[val][0]);
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+  gf_two_byte_region_table_multiply(&rd, base);
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+  gf_val_32_t
+gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_double_table_lazy_data *ftd;
+
+  ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->smult[a][b]);
+}
+
+static
+  gf_val_32_t
+gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_double_table_lazy_data *ftd;
+
+  ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (ftd->div[a][b]);
+}
+
+static
+  void
+gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  struct gf_w8_single_table_data *ftd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  if (xor) {
+    for (i = 0; i < bytes; i++) {
+      d8[i] ^= ftd->multtable[s8[i]][val];
+    }
+  } else {
+    for (i = 0; i < bytes; i++) {
+      d8[i] = ftd->multtable[s8[i]][val];
+    }
+  }
+}
+
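+/* Half-table split with PSHUFB: for a fixed val, the 16 products of val with
+   the low nibble and with the high nibble each fit in one XMM register, so
+   _mm_shuffle_epi8 looks up 16 source bytes at a time; the two lookups are
+   xored to give val * src (cf. gf_w8_split_multiply() below). */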
+#ifdef INTEL_SSSE3
+static
+  void
+gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint8_t *bh, *bl, *sptr, *dptr;
+  __m128i  loset, t1, r, va, mth, mtl;
+  struct gf_w8_half_table_data *htd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  bh = (uint8_t *) htd->high;
+  bh += (val << 4);
+  bl = (uint8_t *) htd->low;
+  bl += (val << 4);
+
+  sptr = rd.s_start;
+  dptr = rd.d_start;
+
+  mth = _mm_loadu_si128 ((__m128i *)(bh));
+  mtl = _mm_loadu_si128 ((__m128i *)(bl));
+  loset = _mm_set1_epi8 (0x0f);
+
+  if (xor) {
+    while (sptr < (uint8_t *) rd.s_top) {
+      va = _mm_load_si128 ((__m128i *)(sptr));
+      t1 = _mm_and_si128 (loset, va);
+      r = _mm_shuffle_epi8 (mtl, t1);
+      va = _mm_srli_epi64 (va, 4);
+      t1 = _mm_and_si128 (loset, va);
+      r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
+      va = _mm_load_si128 ((__m128i *)(dptr));
+      r = _mm_xor_si128 (r, va);
+      _mm_store_si128 ((__m128i *)(dptr), r);
+      dptr += 16;
+      sptr += 16;
+    }
+  } else {
+    while (sptr < (uint8_t *) rd.s_top) {
+      va = _mm_load_si128 ((__m128i *)(sptr));
+      t1 = _mm_and_si128 (loset, va);
+      r = _mm_shuffle_epi8 (mtl, t1);
+      va = _mm_srli_epi64 (va, 4);
+      t1 = _mm_and_si128 (loset, va);
+      r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
+      _mm_store_si128 ((__m128i *)(dptr), r);
+      dptr += 16;
+      sptr += 16;
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: FULL_TABLE:
+ */
+
+static
+  gf_val_32_t
+gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w8_half_table_data *htd;
+  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  return htd->high[b][a>>4] ^ htd->low[b][a&0xf];
+}
+
+static
+  void
+gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  struct gf_w8_half_table_data *htd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+
+  if (xor) {
+    for (i = 0; i < bytes; i++) {
+      d8[i] ^= (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
+    }
+  } else {
+    for (i = 0; i < bytes; i++) {
+      d8[i] = (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
+    }
+  }
+}
+
+
+  static
+int gf_w8_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w8_half_table_data *htd;
+  int a, b;
+
+  h = (gf_internal_t *) gf->scratch;
+  htd = (struct gf_w8_half_table_data *)h->private;
+
+  bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
+  bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
+
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_HALF_SIZE; b++) {
+      htd->low[a][b] = gf_w8_shift_multiply(gf,a,b);
+      htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4);
+    }
+  }
+
+  gf->multiply.w32 = gf_w8_split_multiply;
+  
+  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+    if (h->region_type & GF_REGION_NOSIMD)
+      gf->multiply_region.w32 = gf_w8_split_multiply_region;
+    else
+    #if defined(INTEL_SSSE3)
+      gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+    #elif defined(ARM_NEON)
+      gf_w8_neon_split_init(gf);
+    #endif
+  #else
+    gf->multiply_region.w32 = gf_w8_split_multiply_region;
+    if(h->region_type & GF_REGION_SIMD)
+      return 0;
+  #endif
+
+  return 1;
+}
+
+/* JSP: This is disgusting, but it is what it is.  If there is no SSE,
+   then the default is equivalent to single table.  If there is SSE, then
+   we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */
+   
+static
+int gf_w8_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w8_single_table_data *ftd = NULL;
+  struct gf_w8_double_table_data *dtd = NULL;
+  struct gf_w8_double_table_lazy_data *ltd = NULL;
+  struct gf_w8_default_data *dd = NULL;
+  int a, b, c, prod, scase, use_simd;
+
+  h = (gf_internal_t *) gf->scratch;
+
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+  use_simd = 1;
+#else
+  use_simd = 0;
+#endif
+
+  if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
+    dd = (struct gf_w8_default_data *)h->private;
+    scase = 3;
+    bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
+    bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
+    bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  } else if (h->mult_type == GF_MULT_DEFAULT || 
+             h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) {
+    ftd = (struct gf_w8_single_table_data *)h->private;
+    bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    scase = 0;
+  } else if (h->region_type == GF_REGION_DOUBLE_TABLE) {
+    dtd = (struct gf_w8_double_table_data *)h->private;
+    bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    scase = 1;
+  } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
+    ltd = (struct gf_w8_double_table_lazy_data *)h->private;
+    bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    scase = 2;
+  } else {
+    fprintf(stderr, "Internal error in gf_w8_table_init\n");
+    assert(0);
+  }
+
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w8_shift_multiply(gf,a,b);
+      switch (scase) {
+        case 0: 
+          ftd->multtable[a][b] = prod;
+          ftd->divtable[prod][b] = a;
+          break;
+        case 1:
+          dtd->div[prod][b] = a;
+          for (c = 0; c < GF_FIELD_SIZE; c++) {
+            dtd->mult[a][(c<<8)|b] |= prod;
+            dtd->mult[a][(b<<8)|c] |= (prod<<8);
+          }
+          break;
+        case 2:
+          ltd->div[prod][b] = a;
+          ltd->smult[a][b] = prod;
+          break;
+        case 3:
+          dd->multtable[a][b] = prod;
+          dd->divtable[prod][b] = a;
+          if ((b & 0xf) == b) { dd->low[a][b] = prod; }
+          if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; }
+          break;
+      }
+    }
+  }
+
+  gf->inverse.w32 = NULL; /* Will set from divide */
+  switch (scase) {
+    case 0: 
+      gf->divide.w32 = gf_w8_table_divide;
+      gf->multiply.w32 = gf_w8_table_multiply;
+      gf->multiply_region.w32 = gf_w8_table_multiply_region;
+      break;
+    case 1:
+      gf->divide.w32 = gf_w8_double_table_divide;
+      gf->multiply.w32 = gf_w8_double_table_multiply;
+      gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
+      break;
+    case 2:
+      gf->divide.w32 = gf_w8_double_table_lazy_divide;
+      gf->multiply.w32 = gf_w8_double_table_lazy_multiply;
+      gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
+      break;
+    case 3:
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+      gf->divide.w32 = gf_w8_default_divide;
+      gf->multiply.w32 = gf_w8_default_multiply;
+#if defined(INTEL_SSSE3)
+      gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+#elif defined(ARM_NEON)
+      gf_w8_neon_split_init(gf);
+#endif
+#endif
+      break;
+  }
+  return 1;
+}
+
+static
+  void
+gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t val0 = val & 0x0f;
+  uint8_t val1 = (val & 0xf0) >> 4;
+  gf_region_data rd;
+  int sub_reg_size;
+
+  if (val == 0) {
+    if (xor) return;
+    bzero(dest, bytes);
+    return;
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  sub_reg_size = ((uint8_t *)rd.d_top - (uint8_t *)rd.d_start) / 2;
+
+  base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, rd.s_start, (uint8_t *)rd.d_start+sub_reg_size, val1, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, val0, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+gf_val_32_t
+gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t b0 = b & 0x0f; 
+  uint8_t b1 = (b & 0xf0) >> 4; 
+  uint8_t a0 = a & 0x0f; 
+  uint8_t a1 = (a & 0xf0) >> 4; 
+  uint8_t a1b1;
+
+  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+  return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | 
+          ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
+           base_gf->multiply.w32(base_gf, a0, b1) ^ 
+           base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
+}
+
+static
+gf_val_32_t
+gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint8_t b0 = b & 0x0f; 
+  uint8_t b1 = (b & 0xf0) >> 4; 
+  uint8_t a0 = a & 0x0f; 
+  uint8_t a1 = (a & 0xf0) >> 4; 
+  uint8_t a1b1, *mt;
+  struct gf_w8_composite_data *cd;
+
+  cd = (struct gf_w8_composite_data *) h->private;
+  mt = cd->mult_table;
+
+  a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
+
+  return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | 
+          ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ 
+           GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ 
+           GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report) 
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 
+ * 
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ * 
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ * 
+ * a / b = a * c
+ */
+
+static
+gf_val_32_t
+gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t a0 = a & 0x0f; 
+  uint8_t a1 = (a & 0xf0) >> 4; 
+  uint8_t c0, c1, c, d, tmp;
+  uint8_t a0inv, a1inv; 
+
+  if (a0 == 0) {
+    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w32(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
+    a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf;
+
+    d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf;
+
+    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf;
+    tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf;
+
+    d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf;
+
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf; 
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf; 
+  }
+
+  c = c0 | (c1 << 4);
+
+  return c;
+}
+
+static
+void
+gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t b0 = val & 0x0f; 
+  uint8_t b1 = (val & 0xf0) >> 4; 
+  uint8_t *s8;
+  uint8_t *d8; 
+  uint8_t *mt;
+  uint8_t a0, a1, a1b1;
+  struct gf_w8_composite_data *cd;
+
+  cd = (struct gf_w8_composite_data *) h->private;
+
+  if (val == 0) {
+    if (xor) return;
+    bzero(dest, bytes);
+    return;
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  mt = cd->mult_table;
+  if (mt == NULL) {
+    if (xor) {
+      while (d8 < (uint8_t *) rd.d_top) {
+        a0 = *s8 & 0x0f; 
+        a1 = (*s8 & 0xf0) >> 4; 
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+  
+        *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | 
+               ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
+                 base_gf->multiply.w32(base_gf, a0, b1) ^ 
+                 base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
+        s8++;
+        d8++;
+      }
+    } else {
+      while (d8 < (uint8_t *) rd.d_top) {
+        a0 = *s8 & 0x0f; 
+        a1 = (*s8 & 0xf0) >> 4; 
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+  
+        *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | 
+              ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
+                base_gf->multiply.w32(base_gf, a0, b1) ^ 
+                base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
+        s8++;
+        d8++;
+      }
+    }
+  } else {
+    if (xor) {
+      while (d8 < (uint8_t *) rd.d_top) {
+        a0 = *s8 & 0x0f; 
+        a1 = (*s8 & 0xf0) >> 4; 
+        a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
+  
+        *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | 
+               ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ 
+                 GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ 
+                 GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
+        s8++;
+        d8++;
+      }
+    } else {
+      while (d8 < (uint8_t *) rd.d_top) {
+        a0 = *s8 & 0x0f; 
+        a1 = (*s8 & 0xf0) >> 4; 
+        a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
+  
+        *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | 
+              ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ 
+                GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ 
+                GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
+        s8++;
+        d8++;
+      }
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+  return;
+}
+
+static
+int gf_w8_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w8_composite_data *cd;
+
+  if (h->base_gf == NULL) return 0;
+
+  cd = (struct gf_w8_composite_data *) h->private;
+  cd->mult_table = gf_w4_get_mult_table(h->base_gf);
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt;
+  } else {
+    gf->multiply_region.w32 = gf_w8_composite_multiply_region;
+  }
+
+  if (cd->mult_table == NULL) {
+    gf->multiply.w32 = gf_w8_composite_multiply_recursive;
+  } else {
+    gf->multiply.w32 = gf_w8_composite_multiply_inline;
+  }
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = gf_w8_composite_inverse;
+
+  return 1;
+}
+
+static
+inline
+  gf_val_32_t
+gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, pmask, amask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  pmask = 0x80;
+  amask = 0x80;
+
+  while (amask != 0) {
+    if (prod & pmask) {
+      prod = ((prod << 1) ^ pp);
+    } else {
+      prod <<= 1;
+    }
+    if (a & amask) prod ^= b;
+    amask >>= 1;
+  }
+  return prod;
+}
+
+static
+inline
+  gf_val_32_t
+gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, bmask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  bmask = 0x80;
+
+  while (1) {
+    if (a & 1) prod ^= b;
+    a >>= 1;
+    if (a == 0) return prod;
+    if (b & bmask) {
+      b = ((b << 1) ^ pp);
+    } else {
+      b <<= 1;
+    }
+  }
+}
+
+static
+  void 
+gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
+  gf_region_data rd;
+  struct gf_w8_bytwo_data *btd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x80;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else { 
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x80;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#define BYTWO_P_ONESTEP {\
+  SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+  t1 = _mm_and_si128(v, one); \
+  t1 = _mm_sub_epi8(t1, one); \
+  t1 = _mm_and_si128(t1, ta); \
+  prod = _mm_xor_si128(prod, t1); \
+  v = _mm_srli_epi64(v, 1); }
+
+#ifdef INTEL_SSE2
+static
+  void 
+gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint8_t vrev;
+  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
+  struct gf_w8_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  vrev = 0;
+  for (i = 0; i < 8; i++) {
+    vrev <<= 1;
+    if (!(val & (1 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+  one = _mm_set1_epi8(1);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi8(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+  void
+gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static
+  void
+gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    vb = _mm_load_si128 ((__m128i *)(d8));
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+static
+  void 
+gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int itb;
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_w8_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  if (val == 2) {
+    if (xor) {
+      gf_w8_bytwo_b_sse_region_2_xor(&rd, btd);
+    } else {
+      gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd);
+    }
+    gf_do_final_region_alignment(&rd);
+    return;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
+    itb = val;
+    while (1) {
+      if (itb & 1) vb = _mm_xor_si128(vb, va);
+      itb >>= 1;
+      if (itb == 0) break;
+      SSE_AB2(pp, m1, m2, va, t1, t2);
+    }
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+static
+  void 
+gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
+  struct gf_w8_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  switch (val) {
+    case 2:
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= ta;
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta;
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+    case 3:
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+    case 4:
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= ta;
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta;
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+    case 5:
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta ^ prod;
+          d64++;
+          s64++;
+        }
+      }
+      break;
+    case 6:
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta ^ prod;
+          d64++;
+          s64++;
+        }
+      }
+      break;
+      /*
+         case 7:
+         if (xor) {
+         while (d64 < (uint64_t *) rd.d_top) {
+         ta = *s64;
+         prod = ta;
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+         prod ^= ta;
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod ^= ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 = ta ^ prod;
+       d64++;
+       s64++;
+       }
+       }
+       break; 
+       */
+    case 8:
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= ta;
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta;
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+      /*
+         case 9:
+         if (xor) {
+         while (d64 < (uint64_t *) rd.d_top) {
+         ta = *s64;
+         prod = ta;
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 = (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       }
+       break; 
+       case 10:
+       if (xor) {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 = (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       }
+       break; 
+       case 11:
+       if (xor) {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod ^= ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod ^= ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+      }
+  }
+  break; 
+    case 12:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+    case 13:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+    case 14:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+    case 15:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+  */
+    default:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = *d64;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = 0;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+int gf_w8_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  uint64_t ip, m1, m2;
+  struct gf_w8_bytwo_data *btd;
+
+  h = (gf_internal_t *) gf->scratch;
+  btd = (struct gf_w8_bytwo_data *) (h->private);
+  ip = h->prim_poly & 0xff;
+  m1 = 0xfe;
+  m2 = 0x80;
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+
+  while (ip != 0) {
+    btd->prim_poly |= ip;
+    btd->mask1 |= m1;
+    btd->mask2 |= m2;
+    ip <<= GF_FIELD_WIDTH;
+    m1 <<= GF_FIELD_WIDTH;
+    m2 <<= GF_FIELD_WIDTH;
+  }
+
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    gf->multiply.w32 = gf_w8_bytwo_p_multiply;
+#ifdef INTEL_SSE2
+    if (h->region_type & GF_REGION_NOSIMD)
+      gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
+    else
+      gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region;
+#else
+    gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
+    if(h->region_type & GF_REGION_SIMD)
+      return 0;
+#endif
+  } else {
+    gf->multiply.w32 = gf_w8_bytwo_b_multiply;
+#ifdef INTEL_SSE2
+    if (h->region_type & GF_REGION_NOSIMD)
+      gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
+    else
+      gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region;
+#else
+    gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
+    if(h->region_type & GF_REGION_SIMD)
+      return 0;
+#endif
+  }
+  return 1;
+}
+
+
+/* ------------------------------------------------------------
+   General procedures.
+   You don't need to error check here or in init, because it's done
+   for you in gf_error_check().
+ */
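A hypothetical caller sketch for the two entry points below (assuming the gf_scratch_size() and gf_init_hard() declarations from gf_complete.h; passing 0 for prim_poly requests the default 0x11d chosen in gf_w8_init()). The argument validation referred to above is expected to run inside those top-level calls before this file's code is reached:

#include <stdio.h>
#include <stdlib.h>
#include "gf_complete.h"

/* Hypothetical sketch, not part of the imported file: the caller sizes and
 * owns the scratch buffer and asks for the plain w=8 single-table variant. */
static int w8_table_demo(void)
{
  gf_t gf;
  void *scratch;
  int size;

  size = gf_scratch_size(8, GF_MULT_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0);
  if (size <= 0) return 0;                       /* invalid combination */

  scratch = malloc(size);
  if (scratch == NULL) return 0;

  if (gf_init_hard(&gf, 8, GF_MULT_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
                   0, 0, 0, NULL, scratch) == 0) {
    free(scratch);
    return 0;
  }

  printf("3 * 7 = %u in GF(2^8)\n", (unsigned) gf.multiply.w32(&gf, 3, 7));

  free(scratch);                                 /* caller owns the scratch */
  return 1;
}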
+
+int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  switch(mult_type)
+  {
+    case GF_MULT_DEFAULT:
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
+#endif
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
+    case GF_MULT_TABLE:
+      if (region_type == GF_REGION_CAUCHY) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
+      }
+
+      if (region_type == GF_REGION_DEFAULT) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
+      } 
+      if (region_type & GF_REGION_DOUBLE_TABLE) {
+        if (region_type == GF_REGION_DOUBLE_TABLE) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64;
+        } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64;
+        } else {
+          return 0;
+        }
+      }
+      return 0;
+      break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data);
+      break;
+    case GF_MULT_SPLIT_TABLE:
+      if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64;
+      }
+      break;
+    case GF_MULT_LOG_TABLE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64;
+      break;
+    case GF_MULT_LOG_ZERO:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64;
+      break;
+    case GF_MULT_LOG_ZERO_EXT:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64;
+      break;
+    case GF_MULT_CARRY_FREE:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_COMPOSITE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64;
+    default:
+      return 0;
+  }
+  return 0;
+}
+
+int gf_w8_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) { 
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0;   /* JSP: This shouldn't happen, but just in case. */
+    } else {             
+      h->prim_poly = 0x11d;
+    } 
+  }
+  if (h->mult_type != GF_MULT_COMPOSITE) { 
+    h->prim_poly |= 0x100;
+  }
+
+  gf->multiply.w32 = NULL;
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = NULL;
+  gf->multiply_region.w32 = NULL;
+  gf->extract_word.w32 = gf_w8_extract_word;
+
+  switch(h->mult_type) {
+    case GF_MULT_DEFAULT:      
+    case GF_MULT_TABLE:        if (gf_w8_table_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:      if (gf_w8_bytwo_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_ZERO:
+    case GF_MULT_LOG_ZERO_EXT:
+    case GF_MULT_LOG_TABLE:    if (gf_w8_log_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE:   if (gf_w8_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:        if (gf_w8_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_SPLIT_TABLE:  if (gf_w8_split_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:    if (gf_w8_composite_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    gf->divide.w32 = gf_w8_divide_from_inverse;
+    gf->inverse.w32 = gf_w8_euclid;
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    gf->divide.w32 = gf_w8_divide_from_inverse;
+    gf->inverse.w32 = gf_w8_matrix;
+  }
+
+  if (gf->divide.w32 == NULL) {
+    gf->divide.w32 = gf_w8_divide_from_inverse;
+    if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid;
+  }
+
+  if (gf->inverse.w32 == NULL)  gf->inverse.w32 = gf_w8_inverse_from_divide;
+
+  if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
+    gf->extract_word.w32 = gf_w8_composite_extract_word;
+  }
+
+  if (h->region_type == GF_REGION_CAUCHY) {
+    gf->multiply_region.w32 = gf_wgen_cauchy_region;
+    gf->extract_word.w32 = gf_wgen_extract_word;
+  }
+
+  if (gf->multiply_region.w32 == NULL) {
+    gf->multiply_region.w32 = gf_w8_multiply_region_from_single;
+  }
+
+  return 1;
+}
+
+
+/* Inline setup functions */
+
+uint8_t *gf_w8_get_mult_table(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w8_default_data *ftd;
+  struct gf_w8_single_table_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (gf->multiply.w32 == gf_w8_default_multiply) {
+    ftd = (struct gf_w8_default_data *) h->private;
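The derivation above can be cross-checked with a small standalone program. The sketch below is illustrative only: the GF(2^4) reduction polynomial 0x13 and every name in it are assumptions of the sketch, not part of gf-complete, and it only exercises the general branch where both halves of b are nonzero (the zero-coefficient cases are handled separately in gf_w8_composite_inverse() above).

#include <stdio.h>
#include <stdint.h>

static uint8_t gf16_mul(uint8_t a, uint8_t b)      /* GF(2^4), x^4 + x + 1 */
{
  uint8_t p = 0;
  while (b) {
    if (b & 1) p ^= a;
    a <<= 1;
    if (a & 0x10) a ^= 0x13;
    b >>= 1;
  }
  return p;
}

static uint8_t gf16_inv(uint8_t a)                 /* brute-force inverse */
{
  uint8_t c;
  for (c = 1; c < 16; c++) if (gf16_mul(a, c) == 1) return c;
  return 0;
}

int main(void)
{
  uint8_t s, t, b0, b1;

  /* pick an s for which x^2 + s*x + 1 has no root in, and so is irreducible over, GF(2^4) */
  for (s = 1; s < 16; s++) {
    for (t = 1; t < 16 && (gf16_mul(t, t) ^ gf16_mul(s, t) ^ 1) != 0; t++) ;
    if (t == 16) break;
  }

  for (b1 = 1; b1 < 16; b1++) {
    for (b0 = 1; b0 < 16; b0++) {
      uint8_t r  = gf16_mul(b1, gf16_inv(b0));                    /* b1*b0^-1 */
      uint8_t d  = gf16_mul(r, gf16_inv(r ^ gf16_mul(b0, gf16_inv(b1)) ^ s));
      uint8_t c0 = gf16_mul(d ^ 1, gf16_inv(b0));
      uint8_t c1 = gf16_mul(d, gf16_inv(b1));
      /* c*b: the constant term must be 1 and the x term must be 0 */
      if ((gf16_mul(b0, c0) ^ gf16_mul(b1, c1)) != 1 ||
          (gf16_mul(b1, c0) ^ gf16_mul(b0, c1) ^
           gf16_mul(s, gf16_mul(b1, c1))) != 0) {
        printf("mismatch at b1=%d b0=%d\n", b1, b0);
        return 1;
      }
    }
  }
  printf("inverse identities hold for s=%d\n", s);
  return 0;
}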
+    return (uint8_t *) ftd->multtable;
+  } else if (gf->multiply.w32 == gf_w8_table_multiply) {
+    std = (struct gf_w8_single_table_data *) h->private;
+    return (uint8_t *) std->multtable;
+  }
+  return NULL;
+}
+
+uint8_t *gf_w8_get_div_table(gf_t *gf)
+{
+  struct gf_w8_default_data *ftd;
+  struct gf_w8_single_table_data *std;
+
+  if (gf->multiply.w32 == gf_w8_default_multiply) {
+    ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
+    return (uint8_t *) ftd->divtable;
+  } else if (gf->multiply.w32 == gf_w8_table_multiply) {
+    std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+    return (uint8_t *) std->divtable;
+  }
+  return NULL;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
new file mode 100644
index 0000000..ebc50a5
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
@@ -0,0 +1,1019 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_wgen.c
+ *
+ * Routines for Galois fields for general w < 32.  For specific w,
+ * like 4, 8, 16, 32, 64 and 128, see the other files.
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+
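As the header comment says, this file backs every word size that has no dedicated implementation. A minimal usage sketch, hypothetical and assuming that the gf_init_easy()/gf_free() helpers declared in gf_complete.h route an odd width such as w = 13 here, where the GF_MULT_DEFAULT branch of gf_wgen_init() below serves it with the 16-bit log tables:

#include <assert.h>
#include <stdint.h>
#include "gf_complete.h"

/* Illustrative only: there is no gf_w13.c, so a 13-bit field is expected to
 * end up in gf_wgen_init() at the bottom of this file. */
static void wgen_demo(void)
{
  gf_t gf;
  uint32_t a = 0x155, b = 0x0a3, p;

  if (!gf_init_easy(&gf, 13)) return;

  p = gf.multiply.w32(&gf, a, b);
  assert(gf.divide.w32(&gf, p, b) == a);          /* divide undoes multiply */
  assert(gf.multiply.w32(&gf, a, gf.inverse.w32(&gf, a)) == 1);

  gf_free(&gf, 0);
}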
+struct gf_wgen_table_w8_data {
+  uint8_t *mult;
+  uint8_t *div;
+  uint8_t base;
+};
+
+struct gf_wgen_table_w16_data {
+  uint16_t *mult;
+  uint16_t *div;
+  uint16_t base;
+};
+
+struct gf_wgen_log_w8_data {
+  uint8_t *log;
+  uint8_t *anti;
+  uint8_t *danti;
+  uint8_t base;
+};
+
+struct gf_wgen_log_w16_data {
+  uint16_t *log;
+  uint16_t *anti;
+  uint16_t *danti;
+  uint16_t base;
+};
+
+struct gf_wgen_log_w32_data {
+  uint32_t *log;
+  uint32_t *anti;
+  uint32_t *danti;
+  uint32_t base;
+};
+
+struct gf_wgen_group_data {
+    uint32_t *reduce;
+    uint32_t *shift;
+    uint32_t mask;
+    uint64_t rmask;
+    int tshift;
+    uint32_t memory;
+};
+
+static
+inline
+gf_val_32_t gf_wgen_inverse_from_divide (gf_t *gf, gf_val_32_t a)
+{
+  return gf->divide.w32(gf, 1, a);
+}
+
+static
+inline
+gf_val_32_t gf_wgen_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  b = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b);
+}
+
+static
+inline
+gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b)
+{
+  
+  gf_val_32_t e_i, e_im1, e_ip1;
+  gf_val_32_t d_i, d_im1, d_ip1;
+  gf_val_32_t y_i, y_im1, y_ip1;
+  gf_val_32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = ((gf_internal_t *) (gf->scratch))->w;
+  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint8_t *ptr;
+  uint32_t rv;
+  int rs;
+  int byte, bit, i;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  rs = bytes / h->w;
+  byte = index/8;
+  bit = index%8;
+
+  ptr = (uint8_t *) start;
+  ptr += bytes;
+  ptr -= rs;
+  ptr += byte;
+
+  rv = 0;
+  for (i = 0; i < h->w; i++) {
+    rv <<= 1;
+    if ((*ptr) & (1 << bit)) rv |= 1;
+    ptr -= rs;
+  }
+  
+  return rv;
+}
+
+static
+inline
+gf_val_32_t gf_wgen_matrix (gf_t *gf, gf_val_32_t b)
+{
+  return gf_bitmatrix_inverse(b, ((gf_internal_t *) (gf->scratch))->w, 
+              ((gf_internal_t *) (gf->scratch))->prim_poly);
+}
+
+static
+inline
+uint32_t
+gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
+{
+  uint64_t product, i, pp, a, b, one;
+  gf_internal_t *h;
+ 
+  a = a32;
+  b = b32;
+  h = (gf_internal_t *) gf->scratch;
+  one = 1;
+  pp = h->prim_poly | (one << h->w);
+
+  product = 0;
+
+  for (i = 0; i < (uint64_t)h->w; i++) {
+    if (a & (one << i)) product ^= (b << i);
+  }
+  for (i = h->w*2-1; i >= (uint64_t)h->w; i--) {
+    if (product & (one << i)) product ^= (pp << (i-h->w));
+  }
+  return product;
+}
+
+static 
+int gf_wgen_shift_init(gf_t *gf)
+{
+  gf->multiply.w32 = gf_wgen_shift_multiply;
+  gf->inverse.w32 = gf_wgen_euclid;
+  return 1;
+}
+
+static
+gf_val_32_t
+gf_wgen_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, bmask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  bmask = (1 << (h->w-1));
+
+  while (1) {
+    if (a & 1) prod ^= b;
+    a >>= 1;
+    if (a == 0) return prod;
+    if (b & bmask) {
+      b = ((b << 1) ^ pp);
+    } else {
+      b <<= 1;
+    }
+  }
+}
+
+static 
+int gf_wgen_bytwo_b_init(gf_t *gf)
+{
+  gf->multiply.w32 = gf_wgen_bytwo_b_multiply;
+  gf->inverse.w32 = gf_wgen_euclid;
+  return 1;
+}
+
+static
+inline
+gf_val_32_t
+gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, pmask, amask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  pmask = (1 << ((h->w)-1)); /*Ben: Had an operator precedence warning here*/
+  amask = pmask;
+
+  while (amask != 0) {
+    if (prod & pmask) {
+      prod = ((prod << 1) ^ pp);
+    } else {
+      prod <<= 1;
+    }
+    if (a & amask) prod ^= b;
+    amask >>= 1;
+  }
+  return prod;
+}
+
+
+static 
+int gf_wgen_bytwo_p_init(gf_t *gf)
+{
+  gf->multiply.w32 = gf_wgen_bytwo_p_multiply;
+  gf->inverse.w32 = gf_wgen_euclid;
+  return 1;
+}
+
+static
+void
+gf_wgen_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
+{
+  uint32_t i;
+  uint32_t j;
+  int g_s;
+
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    g_s = 2;
+  } else {
+    g_s = h->arg1;
+  }
+
+  shift[0] = 0;
+
+  for (i = 1; i < ((uint32_t)1 << g_s); i <<= 1) {
+    for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
+    if (val & (1 << (h->w-1))) {
+      val <<= 1;
+      val ^= h->prim_poly;
+    } else {
+      val <<= 1;
+    }
+  }
+}
+
+static
+inline
+gf_val_32_t
+gf_wgen_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int leftover, rs;
+  uint32_t p, l, ind, a32;
+  int bits_left;
+  int g_s;
+  int w;
+
+  struct gf_wgen_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+  w = h->w;
+
+  gd = (struct gf_wgen_group_data *) h->private;
+  gf_wgen_group_set_shift_tables(gd->shift, b, h);
+
+  leftover = w % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  rs = w - leftover;
+  a32 = a;
+  ind = a32 >> rs;
+  a32 <<= leftover;
+  a32 &= gd->mask;
+  p = gd->shift[ind];
+
+  bits_left = rs;
+  rs = w - g_s;
+
+  while (bits_left > 0) {
+    bits_left -= g_s;
+    ind = a32 >> rs;
+    a32 <<= g_s;
+    a32 &= gd->mask;
+    l = p >> rs;
+    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)) & gd->mask;
+  }
+  return p;
+}
+
+char *bits(uint32_t v)
+{
+  char *rv;
+  int i, j;
+
+  rv = malloc(30);
+  j = 0;
+  for (i = 27; i >= 0; i--) {
+    rv[j] = '0' + ((v & (1 << i)) ? 1 : 0);
+    j++;
+  }
+  rv[j] = '\0';
+  return rv;
+}
+char *bits_56(uint64_t v)
+{
+  char *rv;
+  int i, j;
+  uint64_t one;
+
+  one = 1;
+
+  rv = malloc(60);
+  j = 0;
+  for (i = 55; i >= 0; i--) {
+    rv[j] = '0' + ((v & (one << i)) ? 1 : 0);
+    j++;
+  }
+  rv[j] = '\0';
+  return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_wgen_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int i;
+  int leftover;
+  uint64_t p, l, r;
+  uint32_t a32, ind;
+  int g_s, g_r;
+  struct gf_wgen_group_data *gd;
+  int w;
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    g_s = 2;
+    g_r = 8;
+  } else {
+    g_s = h->arg1;
+    g_r = h->arg2;
+  }
+  w = h->w;
+  gd = (struct gf_wgen_group_data *) h->private;
+  gf_wgen_group_set_shift_tables(gd->shift, b, h);
+
+  leftover = w % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  a32 = a;
+  ind = a32 >> (w - leftover);
+  p = gd->shift[ind];
+  p <<= g_s;
+  a32 <<= leftover;
+  a32 &= gd->mask;
+
+  i = (w - leftover);
+  while (i > g_s) {
+    ind = a32 >> (w-g_s);
+    p ^= gd->shift[ind];
+    a32 <<= g_s;
+    a32 &= gd->mask;
+    p <<= g_s;
+    i -= g_s;
+  }
+
+  ind = a32 >> (h->w-g_s);
+  p ^= gd->shift[ind];
+
+  for (i = gd->tshift ; i >= 0; i -= g_r) {
+    l = p & (gd->rmask << i);
+    r = gd->reduce[l >> (i+w)];
+    r <<= (i);
+    p ^= r;
+  }
+  return p & gd->mask;
+}
+
+static
+int gf_wgen_group_init(gf_t *gf)
+{
+  uint32_t i, j, p, index;
+  struct gf_wgen_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t g_s, g_r;
+
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    g_s = 2;
+    g_r = 8;
+  } else {
+    g_s = h->arg1;
+    g_r = h->arg2;
+  }
+  gd = (struct gf_wgen_group_data *) h->private;
+  gd->shift = &(gd->memory);
+  gd->reduce = gd->shift + (1 << g_s);
+  gd->mask = (h->w != 31) ? ((1 << h->w)-1) : 0x7fffffff;
+
+  gd->rmask = (1 << g_r) - 1;
+  gd->rmask <<= h->w;
+
+  gd->tshift = h->w % g_s;
+  if (gd->tshift == 0) gd->tshift = g_s;
+  gd->tshift = (h->w - gd->tshift);
+  gd->tshift = ((gd->tshift-1)/g_r) * g_r;
+
+  gd->reduce[0] = 0;
+  for (i = 0; i < ((uint32_t)1 << g_r); i++) {
+    p = 0;
+    index = 0;
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        p ^= (h->prim_poly << j);
+        index ^= (h->prim_poly >> (h->w-j));
+      }
+    }
+    gd->reduce[index] = (p & gd->mask);
+  }
+
+  if (g_s == g_r) {
+    gf->multiply.w32 = gf_wgen_group_s_equals_r_multiply;
+  } else {
+    gf->multiply.w32 = gf_wgen_group_multiply; 
+  }
+  gf->divide.w32 = NULL;
+  return 1;
+}
+
+
+static
+gf_val_32_t
+gf_wgen_table_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w8_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w8_data *) h->private;
+
+  return (std->mult[(a<<h->w)+b]);
+}
+
+static
+gf_val_32_t
+gf_wgen_table_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w8_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w8_data *) h->private;
+
+  return (std->div[(a<<h->w)+b]);
+}
+
+static 
+int gf_wgen_table_8_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  int w;
+  struct gf_wgen_table_w8_data *std;
+  uint32_t a, b, p;
+  
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_table_w8_data *) h->private;
+  
+  std->mult = &(std->base);
+  std->div = std->mult + ((1<<h->w)*(1<<h->w));
+  
+  for (a = 0; a < ((uint32_t)1 << w); a++) {
+    std->mult[a] = 0;
+    std->mult[a<<w] = 0;
+    std->div[a] = 0;
+    std->div[a<<w] = 0;
+  }
+    
+  for (a = 1; a < ((uint32_t)1 << w); a++) {
+    for (b = 1; b < ((uint32_t)1 << w); b++) {
+      p = gf_wgen_shift_multiply(gf, a, b);
+      std->mult[(a<<w)|b] = p;
+      std->div[(p<<w)|a] = b;
+    }
+  }
+
+  gf->multiply.w32 = gf_wgen_table_8_multiply;
+  gf->divide.w32 = gf_wgen_table_8_divide;
+  return 1;
+}
+
+static
+gf_val_32_t
+gf_wgen_table_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w16_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w16_data *) h->private;
+
+  return (std->mult[(a<<h->w)+b]);
+}
+
+static
+gf_val_32_t
+gf_wgen_table_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w16_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w16_data *) h->private;
+
+  return (std->div[(a<<h->w)+b]);
+}
+
+static 
+int gf_wgen_table_16_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  int w;
+  struct gf_wgen_table_w16_data *std;
+  uint32_t a, b, p;
+  
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_table_w16_data *) h->private;
+  
+  std->mult = &(std->base);
+  std->div = std->mult + ((1<<h->w)*(1<<h->w));
+  
+  for (a = 0; a < ((uint32_t)1 << w); a++) {
+    std->mult[a] = 0;
+    std->mult[a<<w] = 0;
+    std->div[a] = 0;
+    std->div[a<<w] = 0;
+  }
+  
+  for (a = 1; a < ((uint32_t)1 << w); a++) {
+    for (b = 1; b < ((uint32_t)1 << w); b++) {
+      p = gf_wgen_shift_multiply(gf, a, b);
+      std->mult[(a<<w)|b] = p;
+      std->div[(p<<w)|a] = b;
+    }
+  }
+
+  gf->multiply.w32 = gf_wgen_table_16_multiply;
+  gf->divide.w32 = gf_wgen_table_16_divide;
+  return 1;
+}
+
+static 
+int gf_wgen_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  
+  h = (gf_internal_t *) gf->scratch;
+  if (h->w <= 8) return gf_wgen_table_8_init(gf);
+  if (h->w <= 14) return gf_wgen_table_16_init(gf);
+
+  /* Returning zero to make the compiler happy, but this won't get 
+     executed, because it is tested in _scratch_space. */
+
+  return 0;
+}
+
+static
+gf_val_32_t
+gf_wgen_log_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w8_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w8_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  return (std->anti[std->log[a]+std->log[b]]);
+}
+
+static
+gf_val_32_t
+gf_wgen_log_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w8_data *std;
+  int index;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w8_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  index = std->log[a];
+  index -= std->log[b];
+
+  return (std->danti[index]);
+}
+
+static 
+int gf_wgen_log_8_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w8_data *std;
+  int w;
+  uint32_t a, i;
+  int check = 0;
+  
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_log_w8_data *) h->private;
+  
+  std->log = &(std->base);
+  std->anti = std->log + (1<<h->w);
+  std->danti = std->anti + (1<<h->w)-1;
+  
+  for (i = 0; i < ((uint32_t)1 << w); i++)
+    std->log[i] = 0;
+
+  a = 1;
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
+    std->log[a] = i;
+    std->anti[i] = a;
+    std->danti[i] = a;
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  gf->multiply.w32 = gf_wgen_log_8_multiply;
+  gf->divide.w32 = gf_wgen_log_8_divide;
+  return 1;
+}
+
+static
+gf_val_32_t
+gf_wgen_log_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w16_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w16_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  return (std->anti[std->log[a]+std->log[b]]);
+}
+
+static
+gf_val_32_t
+gf_wgen_log_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w16_data *std;
+  int index;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w16_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  index = std->log[a];
+  index -= std->log[b];
+
+  return (std->danti[index]);
+}
+
+static 
+int gf_wgen_log_16_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w16_data *std;
+  int w;
+  uint32_t a, i;
+  int check = 0;
+  
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_log_w16_data *) h->private;
+  
+  std->log = &(std->base);
+  std->anti = std->log + (1<<h->w);
+  std->danti = std->anti + (1<<h->w)-1;
+ 
+  for (i = 0; i < ((uint32_t)1 << w); i++)
+    std->log[i] = 0;
+
+  a = 1;
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
+    std->log[a] = i;
+    std->anti[i] = a;
+    std->danti[i] = a;
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check) {
+    if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf);
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+  
+  gf->multiply.w32 = gf_wgen_log_16_multiply;
+  gf->divide.w32 = gf_wgen_log_16_divide;
+  return 1;
+}
+
+static
+gf_val_32_t
+gf_wgen_log_32_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w32_data *std;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w32_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  return (std->anti[std->log[a]+std->log[b]]);
+}
+
+static
+gf_val_32_t
+gf_wgen_log_32_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w32_data *std;
+  int index;
+  
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w32_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  index = std->log[a];
+  index -= std->log[b];
+
+  return (std->danti[index]);
+}
+
+static 
+int gf_wgen_log_32_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w32_data *std;
+  int w;
+  uint32_t a, i;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_log_w32_data *) h->private;
+  
+  std->log = &(std->base);
+  std->anti = std->log + (1<<h->w);
+  std->danti = std->anti + (1<<h->w)-1;
+  
+  for (i = 0; i < ((uint32_t)1 << w); i++)
+    std->log[i] = 0;
+
+  a = 1;
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
+    std->log[a] = i;
+    std->anti[i] = a;
+    std->danti[i] = a;
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  gf->multiply.w32 = gf_wgen_log_32_multiply;
+  gf->divide.w32 = gf_wgen_log_32_divide;
+  return 1;
+}
+
+static 
+int gf_wgen_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  
+  h = (gf_internal_t *) gf->scratch;
+  if (h->w <= 8) return gf_wgen_log_8_init(gf);
+  if (h->w <= 16) return gf_wgen_log_16_init(gf);
+  if (h->w <= 32) return gf_wgen_log_32_init(gf); 
+
+  /* Returning zero to make the compiler happy, but this won't get 
+     executed, because it is tested in _scratch_space. */
+
+  return 0;
+}
+
+int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+
+  switch(mult_type)
+  {
+    case GF_MULT_DEFAULT: 
+      if (w <= 8) {
+          return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
+               sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
+      } else if (w <= 16) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
+               sizeof(uint16_t)*(1 << w)*3;
+      } else {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) +
+               sizeof(uint32_t) * (1 << 2) +
+               sizeof(uint32_t) * (1 << 8) + 64;
+      }
+    case GF_MULT_SHIFT:
+    case GF_MULT_BYTWO_b:
+    case GF_MULT_BYTWO_p:
+      return sizeof(gf_internal_t);
+      break;
+    case GF_MULT_GROUP:
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) +
+               sizeof(uint32_t) * (1 << arg1) +
+               sizeof(uint32_t) * (1 << arg2) + 64;
+      break;
+
+    case GF_MULT_TABLE: 
+      if (w <= 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
+               sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
+      } else if (w < 15) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w16_data) +
+               sizeof(uint16_t)*(1 << w)*(1<<w)*2 + 64;
+      } 
+      return 0;
+    case GF_MULT_LOG_TABLE: 
+      if (w <= 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w8_data) +
+               sizeof(uint8_t)*(1 << w)*3;
+      } else if (w <= 16) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
+               sizeof(uint16_t)*(1 << w)*3;
+      } else if (w <= 27) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w32_data) +
+               sizeof(uint32_t)*(1 << w)*3;
+      } else {
+        return 0;
+      }
+    default:
+      return 0;
+  }
+}
+
+void
+gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  gf_region_data rd;
+  int written;    
+  int rs, i, j;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, -1);
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  rs = bytes / (h->w);
+  
+  written = (xor) ? 0xffffffff : 0;
+  for (i = 0; i < h->w; i++) {
+    for (j = 0; j < h->w; j++) {
+      if (val & (1 << j)) {
+        gf_multby_one(src, ((uint8_t *)dest) + j*rs, rs, (written & (1 << j)));
+        written |= (1 << j);
+      }
+    }
+    src = (uint8_t *)src + rs;
+    val = gf->multiply.w32(gf, val, 2);
+  }
+}
+
+int gf_wgen_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->prim_poly == 0) {
+    switch (h->w) {
+      case 1: h->prim_poly = 1; break;
+      case 2: h->prim_poly = 7; break;
+      case 3: h->prim_poly = 013; break;
+      case 4: h->prim_poly = 023; break;
+      case 5: h->prim_poly = 045; break;
+      case 6: h->prim_poly = 0103; break;
+      case 7: h->prim_poly = 0211; break;
+      case 8: h->prim_poly = 0435; break;
+      case 9: h->prim_poly = 01021; break;
+      case 10: h->prim_poly = 02011; break;
+      case 11: h->prim_poly = 04005; break;
+      case 12: h->prim_poly = 010123; break;
+      case 13: h->prim_poly = 020033; break;
+      case 14: h->prim_poly = 042103; break;
+      case 15: h->prim_poly = 0100003; break;
+      case 16: h->prim_poly = 0210013; break;
+      case 17: h->prim_poly = 0400011; break;
+      case 18: h->prim_poly = 01000201; break;
+      case 19: h->prim_poly = 02000047; break;
+      case 20: h->prim_poly = 04000011; break;
+      case 21: h->prim_poly = 010000005; break;
+      case 22: h->prim_poly = 020000003; break;
+      case 23: h->prim_poly = 040000041; break;
+      case 24: h->prim_poly = 0100000207; break;
+      case 25: h->prim_poly = 0200000011; break;
+      case 26: h->prim_poly = 0400000107; break;
+      case 27: h->prim_poly = 01000000047; break;
+      case 28: h->prim_poly = 02000000011; break;
+      case 29: h->prim_poly = 04000000005; break;
+      case 30: h->prim_poly = 010040000007; break;
+      case 31: h->prim_poly = 020000000011; break;
+      case 32: h->prim_poly = 00020000007; break;
+      default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1);
+    }
+  } else {
+    if (h->w == 32) {
+      h->prim_poly &= 0xffffffff;
+    } else {
+      h->prim_poly |= (1 << h->w);
+      if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0;
+    }
+  }
+
+  gf->multiply.w32 = NULL;
+  gf->divide.w32 = NULL;
+  gf->inverse.w32 = NULL;
+  gf->multiply_region.w32 = gf_wgen_cauchy_region;
+  gf->extract_word.w32 = gf_wgen_extract_word;
+
+  switch(h->mult_type) {
+    case GF_MULT_DEFAULT:
+      if (h->w <= 8) {
+        if (gf_wgen_table_init(gf) == 0) return 0; 
+      } else if (h->w <= 16) {
+        if (gf_wgen_log_init(gf) == 0) return 0; 
+      } else {
+        if (gf_wgen_bytwo_p_init(gf) == 0) return 0; 
+      }
+      break;
+    case GF_MULT_SHIFT:     if (gf_wgen_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_b:     if (gf_wgen_bytwo_b_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p:     if (gf_wgen_bytwo_p_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:     if (gf_wgen_group_init(gf) == 0) return 0; break;
+    case GF_MULT_TABLE:     if (gf_wgen_table_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_TABLE: if (gf_wgen_log_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    gf->divide.w32 = gf_wgen_divide_from_inverse;
+    gf->inverse.w32 = gf_wgen_euclid;
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    gf->divide.w32 = gf_wgen_divide_from_inverse;
+    gf->inverse.w32 = gf_wgen_matrix;
+  }
+
+  if (gf->inverse.w32 == NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_wgen_euclid;
+
+  if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+    gf->divide.w32 = gf_wgen_divide_from_inverse;
+  }
+  if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
+    gf->inverse.w32 = gf_wgen_inverse_from_divide;
+  }
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
new file mode 100644
index 0000000..95bfd80
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
@@ -0,0 +1,356 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j at jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * gf_w16_neon.c
+ *
+ * Neon routines for 16-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w16.h"
+
+#ifdef ARCH_AARCH64
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+                                 uint16_t *d_end, uint8_t *tbl,
+                                 gf_val_32_t val, int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint16x8_t va0, va1, r0, r1;
+  uint8x16_t loset, rl, rh;
+  uint8x16x2_t va;
+
+  uint8x16_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i] = vld1q_u8(tbl + i*16);
+      tbl_h[i] = vld1q_u8(high + i*16);
+  }
+
+  loset = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+      va0 = vld1q_u16(src);
+      va1 = vld1q_u16(src + 8);
+
+      va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+      va.val[0] = vshrq_n_u8(va.val[0], 4);
+      va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+      va = vtrnq_u8(rl, rh);
+      r0 = vreinterpretq_u16_u8(va.val[0]);
+      r1 = vreinterpretq_u16_u8(va.val[1]);
+
+      if (xor) {
+          va0 = vld1q_u16(dst);
+          va1 = vld1q_u16(dst + 8);
+          r0 = veorq_u16(r0, va0);
+          r1 = veorq_u16(r1, va1);
+      }
+      vst1q_u16(dst, r0);
+      vst1q_u16(dst + 8, r1);
+
+      src += 16;
+      dst += 16;
+  }
+}
+
+static
+inline
+void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+                                        uint8_t *dst, uint8_t *d_end,
+                                        uint8_t *tbl, gf_val_32_t val,
+                                        int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint8x16_t vh, vl, rh, rl;
+  uint8x16_t loset;
+
+  uint8x16_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i] = vld1q_u8(tbl + i*16);
+      tbl_h[i] = vld1q_u8(high + i*16);
+  }
+
+  loset = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+      vh = vld1q_u8(src);
+      vl = vld1q_u8(src + 16);
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
+
+      vl = vshrq_n_u8(vl, 4);
+      vh = vshrq_n_u8(vh, 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
+
+      if (xor) {
+          vh = vld1q_u8(dst);
+          vl = vld1q_u8(dst + 16);
+          rh = veorq_u8(rh, vh);
+          rl = veorq_u8(rl, vl);
+      }
+      vst1q_u8(dst, rh);
+      vst1q_u8(dst + 16, rl);
+
+      src += 32;
+      dst += 32;
+  }
+}
+
+#else /* ARCH_AARCH64 */
+
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+                                 uint16_t *d_end, uint8_t *tbl,
+                                 gf_val_32_t val, int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint16x8_t va, r;
+  uint8x8_t loset, vb, vc, rl, rh;
+
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+      tbl_h[i].val[0] = vld1_u8(high + i*16);
+      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+
+  loset = vdup_n_u8(0xf);
+
+  while (dst < d_end) {
+      va = vld1q_u16(src);
+
+      vb = vmovn_u16(va);
+      vc = vshrn_n_u16(va, 8);
+
+      rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
+      rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
+      vb = vshr_n_u8(vb, 4);
+      rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
+      rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
+      vc = vshr_n_u8(vc, 4);
+      rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
+      rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
+      rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
+      rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
+
+      r  = vmovl_u8(rl);
+      r  = vorrq_u16(r, vshll_n_u8(rh, 8));
+
+      if (xor) {
+          va = vld1q_u16(dst);
+          r = veorq_u16(r, va);
+      }
+      vst1q_u16(dst, r);
+
+      src += 8;
+      dst += 8;
+  }
+}
+
+static
+inline
+void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+                                        uint8_t *dst, uint8_t *d_end,
+                                        uint8_t *tbl, gf_val_32_t val,
+                                        int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
+  uint8x8_t loset;
+
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+      tbl_h[i].val[0] = vld1_u8(high + i*16);
+      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+
+  loset = vdup_n_u8(0xf);
+
+  while (dst < d_end) {
+      vh0 = vld1_u8(src);
+      vh1 = vld1_u8(src + 8);
+      vl0 = vld1_u8(src + 16);
+      vl1 = vld1_u8(src + 24);
+
+      r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
+      r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
+      r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
+      r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
+
+      vh0 = vshr_n_u8(vh0, 4);
+      vh1 = vshr_n_u8(vh1, 4);
+      vl0 = vshr_n_u8(vl0, 4);
+      vl1 = vshr_n_u8(vl1, 4);
+
+      r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
+      r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
+      r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
+      r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
+
+      if (xor) {
+          vh0 = vld1_u8(dst);
+          vh1 = vld1_u8(dst + 8);
+          vl0 = vld1_u8(dst + 16);
+          vl1 = vld1_u8(dst + 24);
+          r0  = veor_u8(r0, vh0);
+          r1  = veor_u8(r1, vh1);
+          r2  = veor_u8(r2, vl0);
+          r3  = veor_u8(r3, vl1);
+      }
+      vst1_u8(dst,      r0);
+      vst1_u8(dst +  8, r1);
+      vst1_u8(dst + 16, r2);
+      vst1_u8(dst + 24, r3);
+
+      src += 32;
+      dst += 32;
+  }
+}
+#endif /* ARCH_AARCH64 */
+
+static
+inline
+void
+neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+                                         gf_val_32_t val, int bytes, int xor,
+                                         int altmap)
+{
+  gf_region_data rd;
+  unsigned i, j;
+  uint64_t c, prod;
+  uint8_t tbl[2 * 4 * 16];
+  uint8_t *high = tbl + 4 * 16;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 16; j++) {
+      c = (j << (i*4));
+      prod = gf->multiply.w32(gf, c, val);
+      tbl[i*16 + j]  = prod & 0xff;
+      high[i*16 + j] = prod >> 8;
+    }
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  if (altmap) {
+    uint8_t *s8   = rd.s_start;
+    uint8_t *d8   = rd.d_start;
+    uint8_t *end8 = rd.d_top;
+    if (xor)
+      neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1);
+    else
+      neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0);
+  } else {
+    uint16_t *s16   = rd.s_start;
+    uint16_t *d16   = rd.d_start;
+    uint16_t *end16 = rd.d_top;
+    if (xor)
+      neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1);
+    else
+      neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                            gf_val_32_t val, int bytes, int xor)
+{
+  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+                                                   void *dest,
+                                                   gf_val_32_t val, int bytes,
+                                                   int xor)
+{
+  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+
+void gf_w16_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP)
+    gf->multiply_region.w32 = gf_w16_split_4_16_lazy_altmap_multiply_region_neon;
+  else
+    gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region_neon;
+}
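
For reference, the split-4 lazy table scheme that the NEON routines above vectorize can be written scalar-per-word. This is only an illustrative sketch, not part of the imported sources; mult16() stands in for any GF(2^16) single-multiply routine (for example gf->multiply.w32).

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the split-4 lazy scheme: build 4 tables of 16 products
 * (one per nibble position of the operand), then each 16-bit word is the
 * XOR of 4 table lookups, since multiplication distributes over XOR. */
static void split4_lazy_region_ref(uint16_t (*mult16)(uint16_t, uint16_t),
                                   const uint16_t *src, uint16_t *dst,
                                   size_t nwords, uint16_t val, int xor)
{
  uint16_t tbl[4][16];
  int i, j;

  for (i = 0; i < 4; i++)
    for (j = 0; j < 16; j++)
      tbl[i][j] = mult16((uint16_t)(j << (4 * i)), val);

  for (size_t n = 0; n < nwords; n++) {
    uint16_t p = 0;
    for (i = 0; i < 4; i++)
      p ^= tbl[i][(src[n] >> (4 * i)) & 0xf];
    dst[n] = xor ? (uint16_t)(dst[n] ^ p) : p;
  }
}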
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c
new file mode 100644
index 0000000..8231eb3
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c
@@ -0,0 +1,269 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j at jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w32_neon.c
+ *
+ * Neon routines for 32-bit Galois fields
+ *
+ */
+
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w32.h"
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+void
+neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
+                                    uint32_t *d_end, uint8_t btable[8][4][16],
+                                    uint32_t val, int xor, int altmap)
+{
+  int i, j;
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[8][4];
+#else
+  uint8x8x2_t tables[8][4];
+#endif
+  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
+  uint8x16_t p0, p1, p2, p3, si, mask1;
+  uint16x8x2_t r0, r1;
+  uint8x16x2_t q0, q1;
+
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 4; j++) {
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable[i][j]);
+#else
+      tables[i][j].val[0] = vld1_u8(btable[i][j]);
+      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
+#endif
+    }
+  }
+
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+
+      v0 = vld1q_u32(src); src += 4;
+      v1 = vld1q_u32(src); src += 4;
+      v2 = vld1q_u32(src); src += 4;
+      v3 = vld1q_u32(src); src += 4;
+
+      if (altmap) {
+          q0.val[0] = vreinterpretq_u8_u32(v0);
+          q0.val[1] = vreinterpretq_u8_u32(v1);
+          q1.val[0] = vreinterpretq_u8_u32(v2);
+          q1.val[1] = vreinterpretq_u8_u32(v3);
+      } else {
+          r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
+          r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));
+
+          q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
+                        vreinterpretq_u8_u16(r1.val[0]));
+          q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
+                        vreinterpretq_u8_u16(r1.val[1]));
+      }
+
+      si = vandq_u8(q0.val[0], mask1);
+      p0 = vqtbl1q_u8(tables[0][0], si);
+      p1 = vqtbl1q_u8(tables[0][1], si);
+      p2 = vqtbl1q_u8(tables[0][2], si);
+      p3 = vqtbl1q_u8(tables[0][3], si);
+
+      si = vshrq_n_u8(q0.val[0], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));
+
+      si = vandq_u8(q0.val[1], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));
+
+      si = vshrq_n_u8(q0.val[1], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));
+
+      si = vandq_u8(q1.val[0], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));
+
+      si = vshrq_n_u8(q1.val[0], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));
+
+      si = vandq_u8(q1.val[1], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));
+
+      si = vshrq_n_u8(q1.val[1], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));
+
+      if (altmap) {
+          s0 = vreinterpretq_u32_u8(p0);
+          s1 = vreinterpretq_u32_u8(p1);
+          s2 = vreinterpretq_u32_u8(p2);
+          s3 = vreinterpretq_u32_u8(p3);
+      } else {
+          q0 = vtrnq_u8(p0, p1);
+          q1 = vtrnq_u8(p2, p3);
+
+          r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
+                         vreinterpretq_u16_u8(q1.val[0]));
+          r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
+                         vreinterpretq_u16_u8(q1.val[1]));
+
+          s0 = vreinterpretq_u32_u16(r0.val[0]);
+          s1 = vreinterpretq_u32_u16(r1.val[0]);
+          s2 = vreinterpretq_u32_u16(r0.val[1]);
+          s3 = vreinterpretq_u32_u16(r1.val[1]);
+      }
+
+      if (xor) {
+          v0 = vld1q_u32(dst);
+          v1 = vld1q_u32(dst + 4);
+          v2 = vld1q_u32(dst + 8);
+          v3 = vld1q_u32(dst + 12);
+          s0 = veorq_u32(s0, v0);
+          s1 = veorq_u32(s1, v1);
+          s2 = veorq_u32(s2, v2);
+          s3 = veorq_u32(s3, v3);
+      }
+
+      vst1q_u32(dst,      s0);
+      vst1q_u32(dst + 4,  s1);
+      vst1q_u32(dst + 8,  s2);
+      vst1q_u32(dst + 12, s3);
+
+      dst += 16;
+  }
+}
+
+static
+inline
+void
+neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
+  uint8_t btable[8][4][16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  v = val;
+  for (i = 0; i < 8; i++) {
+    tmp_table[0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        tmp_table[k^j] = (v ^ tmp_table[k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 4; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[i][j][k] = (uint8_t) tmp_table[k];
+        tmp_table[k] >>= 8;
+      }
+    }
+  }
+
+  if (xor)
+    neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap);
+  else
+    neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                            gf_val_32_t val, int bytes, int xor)
+{
+  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+                                                   void *dest, gf_val_32_t val,
+                                                   int bytes, int xor)
+{
+  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+void gf_w32_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP)
+      gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon;
+  else
+      gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon;
+
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c
new file mode 100644
index 0000000..3a21432
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c
@@ -0,0 +1,247 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j at jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w4_neon.c
+ *
+ * Neon routines for 4-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w4.h"
+
+static
+gf_val_32_t
+gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+  gf_val_32_t rv = 0;
+  poly8x8_t       result, prim_poly;
+  poly8x8_t       a, b, w;
+  uint8x8_t       v;
+  gf_internal_t * h = gf->scratch;
+
+  a =  vdup_n_p8 (a4);
+  b =  vdup_n_p8 (b4);
+
+  prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL));
+
+  /* Do the initial multiply */
+  result = vmul_p8 (a, b);
+  v = vshr_n_u8 (vreinterpret_u8_p8(result), 4);
+  w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+  result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w)));
+
+  /* Extracts 32 bit value from result. */
+  rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0);
+
+  return rv;
+}
+
+static inline void
+neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
+                                      gf_val_32_t val, uint8_t *d_end, int xor)
+{
+  gf_internal_t * h = gf->scratch;
+  poly8x8_t       prim_poly;
+  poly8x8_t       a, w, even, odd;
+  uint8x8_t       b, c, v, mask;
+
+  a         = vdup_n_p8 (val);
+  mask      = vdup_n_u8 (0xf);
+  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL));
+
+  while (d8 < d_end) {
+    b = vld1_u8 (s8);
+
+    even = vreinterpret_p8_u8 (vand_u8 (b, mask));
+    odd  = vreinterpret_p8_u8 (vshr_n_u8 (b, 4));
+
+    if (xor)
+        c = vld1_u8 (d8);
+
+    even = vmul_p8 (a, even);
+    odd  = vmul_p8 (a, odd);
+
+    v = vshr_n_u8 (vreinterpret_u8_p8(even), 4);
+    w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+    even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w)));
+
+    v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4);
+    w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+    odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w)));
+
+    v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4));
+
+    if (xor)
+      v = veor_u8 (c, v);
+
+    vst1_u8 (d8, v);
+
+    d8 += 8;
+    s8 += 8;
+  }
+}
+
+
+static void
+gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
+                                            gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *s8;
+  uint8_t *d8;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  if (xor)
+    neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1);
+  else
+    neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+inline
+void
+w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
+                                     uint8_t * d_end, gf_val_32_t val, int xor)
+{
+  struct gf_single_table_data *std;
+  uint8_t *base;
+  uint8x16_t r, va, vh, vl, loset;
+
+#ifdef ARCH_AARCH64
+  uint8x16_t th, tl;
+#else
+  uint8x8x2_t th, tl;
+#endif
+
+  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  base = (uint8_t *) std->mult;
+  base += (val << GF_FIELD_WIDTH);
+
+#ifdef ARCH_AARCH64
+  tl = vld1q_u8 (base);
+  th = vshlq_n_u8 (tl, 4);
+#else
+  tl.val[0] = vld1_u8 (base);
+  tl.val[1] = vld1_u8 (base + 8);
+  th.val[0] =  vshl_n_u8 (tl.val[0], 4);
+  th.val[1] =  vshl_n_u8 (tl.val[1], 4);
+#endif
+
+  loset = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+      va = vld1q_u8 (src);
+
+      vh = vshrq_n_u8 (va, 4);
+      vl = vandq_u8 (va, loset);
+
+      if (xor)
+        va = vld1q_u8 (dst);
+
+      vh = vqtbl1q_u8 (th, vh);
+      vl = vqtbl1q_u8 (tl, vl);
+
+      r = veorq_u8 (vh, vl);
+
+      if (xor)
+        r = veorq_u8 (va, r);
+
+      vst1q_u8 (dst, r);
+
+    dst += 16;
+    src += 16;
+  }
+}
+
+static
+void
+gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                        gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *sptr, *dptr, *top;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  sptr = rd.s_start;
+  dptr = rd.d_start;
+  top  = rd.d_top;
+
+  if (xor)
+      w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1);
+  else
+      w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0);
+
+  gf_do_final_region_alignment(&rd);
+
+}
+
+
+int gf_w4_neon_cfm_init(gf_t *gf)
+{
+  // single clm multiplication probably pointless
+  gf->multiply.w32 = gf_w4_neon_clm_multiply;
+  gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single;
+
+  return 1;
+}
+
+void gf_w4_neon_single_table_init(gf_t *gf)
+{
+  gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon;
+}
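
The single-table NEON routine above processes two packed GF(2^4) symbols per byte. A rough scalar equivalent, purely for illustration, with mult4[][] assumed to be the 16x16 single-multiply table held in gf_single_table_data:

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch: each byte carries two 4-bit symbols (low and high nibble),
 * each looked up independently in the row of the table selected by val. */
static void w4_packed_table_region_ref(const uint8_t mult4[16][16],
                                       const uint8_t *src, uint8_t *dst,
                                       size_t nbytes, uint8_t val, int xor)
{
  for (size_t n = 0; n < nbytes; n++) {
    uint8_t lo = mult4[val][src[n] & 0xf];
    uint8_t hi = mult4[val][src[n] >> 4];
    uint8_t p  = (uint8_t)((hi << 4) | lo);
    dst[n] = xor ? (uint8_t)(dst[n] ^ p) : p;
  }
}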
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
new file mode 100644
index 0000000..0eca9c7
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
@@ -0,0 +1,333 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j at jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w64_neon.c
+ *
+ * Neon routines for 64-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w64.h"
+
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+inline
+void
+neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
+                                             uint64_t *dst, uint64_t *d_end,
+                                             uint64_t val, int xor)
+{
+  unsigned i, j, k;
+  uint8_t btable[16];
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[16][8];
+#else
+  uint8x8x2_t tables[16][8];
+#endif
+  uint8x16_t p[8], mask1, si;
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable);
+#else
+      tables[i][j].val[0] = vld1_u8(btable);
+      tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+    }
+  }
+
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+
+    if (xor) {
+      for (i = 0; i < 8; i++)
+        p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
+    } else {
+      for (i = 0; i < 8; i++)
+        p[i] = vdupq_n_u8(0);
+    }
+
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      uint8x16_t v0 = vld1q_u8((uint8_t *) src);
+      src += 2;
+
+      si = vandq_u8(v0, mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+      si = vshrq_n_u8(v0, 4);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+
+    }
+    for (i = 0; i < 8; i++) {
+      vst1q_u8((uint8_t *) dst, p[i]);
+      dst += 2;
+    }
+  }
+}
+
+static
+inline
+void
+neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
+                                      uint64_t *d_end, uint64_t val, int xor)
+{
+  unsigned i, j, k;
+  uint8_t btable[16];
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[16][8];
+#else
+  uint8x8x2_t tables[16][8];
+#endif
+  uint8x16_t p[8], mask1, si;
+  uint64x2_t st[8];
+  uint32x4x2_t s32[4];
+  uint16x8x2_t s16[4];
+  uint8x16x2_t s8[4];
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable);
+#else
+      tables[i][j].val[0] = vld1_u8(btable);
+      tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+    }
+  }
+
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+
+    for (k = 0; k < 8; k++) {
+      st[k]  = vld1q_u64(src);
+      src += 2;
+      p[k] = vdupq_n_u8(0);
+    }
+
+    s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
+                       vreinterpretq_u32_u64(st[1]));
+    s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
+                       vreinterpretq_u32_u64(st[3]));
+    s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
+                       vreinterpretq_u32_u64(st[5]));
+    s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
+                       vreinterpretq_u32_u64(st[7]));
+
+    s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
+                       vreinterpretq_u16_u32(s32[1].val[0]));
+    s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
+                       vreinterpretq_u16_u32(s32[3].val[0]));
+    s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
+                       vreinterpretq_u16_u32(s32[1].val[1]));
+    s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
+                       vreinterpretq_u16_u32(s32[3].val[1]));
+
+    s8[0]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
+                      vreinterpretq_u8_u16(s16[1].val[0]));
+    s8[1]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
+                      vreinterpretq_u8_u16(s16[1].val[1]));
+    s8[2]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
+                      vreinterpretq_u8_u16(s16[3].val[0]));
+    s8[3]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
+                      vreinterpretq_u8_u16(s16[3].val[1]));
+
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+      si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+    }
+
+    s8[0]  = vzipq_u8(p[0], p[1]);
+    s8[1]  = vzipq_u8(p[2], p[3]);
+    s8[2]  = vzipq_u8(p[4], p[5]);
+    s8[3]  = vzipq_u8(p[6], p[7]);
+
+    s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
+                       vreinterpretq_u16_u8(s8[1].val[0]));
+    s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
+                       vreinterpretq_u16_u8(s8[3].val[0]));
+    s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
+                       vreinterpretq_u16_u8(s8[1].val[1]));
+    s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
+                       vreinterpretq_u16_u8(s8[3].val[1]));
+
+    s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
+                       vreinterpretq_u32_u16(s16[1].val[0]));
+    s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
+                       vreinterpretq_u32_u16(s16[1].val[1]));
+    s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
+                       vreinterpretq_u32_u16(s16[3].val[0]));
+    s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
+                       vreinterpretq_u32_u16(s16[3].val[1]));
+
+    for (k = 0; k < 8; k ++) {
+        st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
+    }
+
+    if (xor) {
+      for (i = 0; i < 8; i++) {
+        uint64x2_t t1 = vld1q_u64(dst);
+        vst1q_u64(dst, veorq_u64(st[i], t1));
+        dst += 2;
+      }
+    } else {
+      for (i = 0; i < 8; i++) {
+        vst1q_u64(dst, st[i]);
+        dst += 2;
+      }
+    }
+
+  }
+}
+
+static
+void
+gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+                                         uint64_t val, int bytes, int xor,
+                                         int altmap)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v, *s64, *d64, *top;
+  struct gf_split_4_64_lazy_data *ld;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  v = val;
+  for (i = 0; i < 16; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+  }
+
+  if (altmap) {
+    if (xor)
+      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
+    else
+      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
+  } else {
+    if (xor)
+      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
+    else
+      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                            uint64_t val, int bytes, int xor)
+{
+  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+                                                   void *dest, uint64_t val,
+                                                   int bytes, int xor)
+{
+  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+void gf_w64_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP)
+      gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon;
+  else
+      gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon;
+
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c
new file mode 100644
index 0000000..930a916
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c
@@ -0,0 +1,302 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j at jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w8_neon.c
+ *
+ * Neon optimized routines for 8-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include "gf_w8.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/* ARM NEON reducing macro for the carry free multiplication
+ *   vmull_p8 is the carryless multiply operation. Here vshrn_n_u16 shifts
+ *   the result to the right by 1 byte. This allows us to multiply
+ *   the prim_poly by the leading bits of the result. We then xor the result
+ *   of that operation back with the result. */
+#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial)               \
+  do {								        \
+    if (initial)                                                        \
+      v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8);               \
+    else                                                                \
+      v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8));  \
+    w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v));                    \
+    result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \
+  } while (0)
+
+static
+inline
+gf_val_32_t
+gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x)
+{
+  gf_val_32_t rv = 0;
+  poly8x8_t       a, b;
+  uint8x8_t       v;
+  poly16x8_t      result;
+  poly8x8_t       prim_poly;
+  poly16x8_t      w;
+  gf_internal_t * h = gf->scratch;
+
+  a =  vdup_n_p8 (a8);
+  b =  vdup_n_p8 (b8);
+
+  prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL));
+
+  /* Do the initial multiply */
+  result = vmull_p8 (a, b);
+
+  /* Ben: Do the prim_poly reduction twice. We are guaranteed to need the
+     reduction at most twice here, because (w-2)/z == 2, where z is the
+     number of zeros after the leading 1 */
+  NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
+  NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  if (x >= 3) {
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  }
+  if (x >= 4) {
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  }
+  /* Extracts 32 bit value from result. */
+  rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0);
+
+  return rv;
+}
+
+#define CLM_MULTIPLY(x) \
+static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \
+{\
+    return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\
+}
+
+CLM_MULTIPLY(2)
+CLM_MULTIPLY(3)
+CLM_MULTIPLY(4)
+
+static inline void
+neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8,
+                                       gf_val_32_t val, uint8_t *d_end,
+                                       int xor, int x)
+{
+  gf_internal_t * h = gf->scratch;
+  poly8x8_t       a, b;
+  uint8x8_t       c, v;
+  poly16x8_t      result;
+  poly8x8_t       prim_poly;
+  poly16x8_t      w;
+
+  a         = vdup_n_p8 (val);
+  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL));
+
+  while (d8 < d_end) {
+    b = vld1_p8 ((poly8_t *) s8);
+
+    if (xor)
+        c = vld1_u8 (d8);
+
+    result = vmull_p8 (a, b);
+
+    NEON_CFM_REDUCE(v, w, result, prim_poly, 1);
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+    if (x >= 3) {
+      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+    }
+    if (x >= 4) {
+      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+    }
+    v = vmovn_u16 (vreinterpretq_u16_p16 (result));
+    if (xor)
+      v = veor_u8 (c, v);
+
+    vst1_u8 (d8, v);
+
+    d8 += 8;
+    s8 += 8;
+  }
+}
+
+#define CLM_MULT_REGION(x)                                              \
+static void                                                             \
+gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src,  \
+                                                  void *dest,           \
+                                                  gf_val_32_t val, int bytes, \
+                                                  int xor)              \
+{                                                                       \
+  gf_region_data rd;                                                    \
+  uint8_t *s8;                                                          \
+  uint8_t *d8;                                                          \
+                                                                        \
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }           \
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }       \
+                                                                        \
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);          \
+  gf_do_initial_region_alignment(&rd);                                  \
+  s8 = (uint8_t *) rd.s_start;                                          \
+  d8 = (uint8_t *) rd.d_start;                                          \
+                                                                        \
+  if (xor)                                                              \
+    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \
+  else                                                                  \
+    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\
+  gf_do_final_region_alignment(&rd);                                    \
+}
+
+CLM_MULT_REGION(2)
+CLM_MULT_REGION(3)
+CLM_MULT_REGION(4)
+
+
+int gf_w8_neon_cfm_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if ((0xe0 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w8_neon_clm_multiply_2;
+    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2;
+  }else if ((0xc0 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w8_neon_clm_multiply_3;
+    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3;
+  }else if ((0x80 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w8_neon_clm_multiply_4;
+    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4;
+  }else{
+    return 0;
+  }
+  return 1;
+}
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+void
+gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint8_t *bh, *bl, *sptr, *dptr;
+  uint8x16_t r, va, vh, vl, loset;
+#ifdef ARCH_AARCH64
+  uint8x16_t mth, mtl;
+#else
+  uint8x8x2_t mth, mtl;
+#endif
+  struct gf_w8_half_table_data *htd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  bh = (uint8_t *) htd->high;
+  bh += (val << 4);
+  bl = (uint8_t *) htd->low;
+  bl += (val << 4);
+
+  sptr = rd.s_start;
+  dptr = rd.d_start;
+
+#ifdef ARCH_AARCH64
+  mth = vld1q_u8 (bh);
+  mtl = vld1q_u8 (bl);
+#else
+  mth.val[0] = vld1_u8 (bh);
+  mtl.val[0] = vld1_u8 (bl);
+  mth.val[1] = vld1_u8 (bh + 8);
+  mtl.val[1] = vld1_u8 (bl + 8);
+#endif
+
+  loset = vdupq_n_u8(0xf);
+
+  if (xor) {
+    while (sptr < (uint8_t *) rd.s_top) {
+      va = vld1q_u8 (sptr);
+
+      vh = vshrq_n_u8 (va, 4);
+      vl = vandq_u8 (va, loset);
+      va = vld1q_u8 (dptr);
+
+      vh = vqtbl1q_u8 (mth, vh);
+      vl = vqtbl1q_u8 (mtl, vl);
+
+      r = veorq_u8 (vh, vl);
+
+      vst1q_u8 (dptr, veorq_u8 (va, r));
+
+      dptr += 16;
+      sptr += 16;
+    }
+  } else {
+    while (sptr < (uint8_t *) rd.s_top) {
+      va = vld1q_u8 (sptr);
+
+      vh = vshrq_n_u8 (va, 4);
+      vl = vandq_u8 (va, loset);
+#ifdef ARCH_AARCH64
+      vh = vqtbl1q_u8 (mth, vh);
+      vl = vqtbl1q_u8 (mtl, vl);
+#else
+      vh = vcombine_u8 (vtbl2_u8 (mth, vget_low_u8 (vh)),
+			vtbl2_u8 (mth, vget_high_u8 (vh)));
+      vl = vcombine_u8 (vtbl2_u8 (mtl, vget_low_u8 (vl)),
+			vtbl2_u8 (mtl, vget_high_u8 (vl)));
+#endif
+
+      r = veorq_u8 (vh, vl);
+
+      vst1q_u8(dptr, r);
+
+      dptr += 16;
+      sptr += 16;
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+
+void gf_w8_neon_split_init(gf_t *gf)
+{
+  gf->multiply_region.w32 = gf_w8_split_multiply_region_neon;
+}
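
The NEON_CFM_REDUCE macro above folds the high half of a carry-less product back into the field. A scalar GF(2^8) equivalent, purely for illustration; 0x11d is one common primitive polynomial and is an assumption here, not something this import configures:

#include <stdint.h>

/* Scalar carry-less multiply followed by reduction modulo the primitive
 * polynomial; prim_poly includes the x^8 term, e.g. 0x11d. */
static uint8_t gf8_clm_mult_ref(uint8_t a, uint8_t b, uint16_t prim_poly)
{
  uint16_t r = 0;
  int i;

  for (i = 0; i < 8; i++)            /* polynomial (carry-less) multiply */
    if (b & (1u << i))
      r ^= (uint16_t)(a << i);

  for (i = 15; i >= 8; i--)          /* fold the overflow bits back, high to low */
    if (r & (1u << i))
      r ^= (uint16_t)(prim_poly << (i - 8));

  return (uint8_t)r;
}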
diff --git a/src/erasure-code/jerasure/jerasure/include/cauchy.h b/src/erasure-code/jerasure/jerasure/include/cauchy.h
new file mode 100644
index 0000000..a4fad6b
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/include/cauchy.h
@@ -0,0 +1,45 @@
+/* *
+ * Copyright (c) 2013, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+extern int *cauchy_original_coding_matrix(int k, int m, int w);
+extern int *cauchy_xy_coding_matrix(int k, int m, int w, int *x, int *y);
+extern void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix);
+extern int *cauchy_good_general_coding_matrix(int k, int m, int w);
+extern int cauchy_n_ones(int n, int w);
diff --git a/src/erasure-code/jerasure/jerasure/include/galois.h b/src/erasure-code/jerasure/jerasure/include/galois.h
new file mode 100644
index 0000000..b57ef3c
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/include/galois.h
@@ -0,0 +1,100 @@
+/* *
+ * Copyright (c) 2013, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _GALOIS_H
+#define _GALOIS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <gf_complete.h>
+
+extern int galois_init_default_field(int w);
+extern void galois_change_technique(gf_t *gf, int w);
+
+extern int galois_single_multiply(int a, int b, int w);
+extern int galois_single_divide(int a, int b, int w);
+extern int galois_inverse(int x, int w);
+
+void galois_region_xor(           char *src,         /* Source Region */
+                                  char *dest,        /* Dest Region (holds result) */
+                                  int nbytes);      /* Number of bytes in region */
+
+/* These functions multiply whole regions in w=8, w=16 and w=32.  They are much
+   faster than calling galois_single_multiply once per word.  The regions must
+   be long-word aligned. */
+
+void galois_w08_region_multiply(char *region,       /* Region to multiply */
+                                  int multby,       /* Number to multiply by */
+                                  int nbytes,       /* Number of bytes in region */
+                                  char *r2,         /* If r2 != NULL, products go here.  
+                                                       Otherwise region is overwritten */
+                                  int add);         /* If (r2 != NULL && add) the product is XOR'd with r2 */
+
+void galois_w16_region_multiply(char *region,       /* Region to multiply */
+                                  int multby,       /* Number to multiply by */
+                                  int nbytes,       /* Number of bytes in region */
+                                  char *r2,         /* If r2 != NULL, products go here.  
+                                                       Otherwise region is overwritten */
+                                  int add);         /* If (r2 != NULL && add) the product is XOR'd with r2 */
+
+void galois_w32_region_multiply(char *region,       /* Region to multiply */
+                                  int multby,       /* Number to multiply by */
+                                  int nbytes,       /* Number of bytes in region */
+                                  char *r2,         /* If r2 != NULL, products go here.  
+                                                       Otherwise region is overwritten */
+                                  int add);         /* If (r2 != NULL && add) the product is XOR'd with r2 */
+
+gf_t* galois_init_field(int w,
+                             int mult_type,
+                             int region_type,
+                             int divide_type,
+                             uint64_t prim_poly,
+                             int arg1,
+                             int arg2);
+
+gf_t* galois_init_composite_field(int w,
+                                int region_type,
+                                int divide_type,
+                                int degree,
+                                gf_t* base_gf);
+
+gf_t * galois_get_field_ptr(int w);
+
+
+#endif
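
As a usage sketch for the region API declared above (the buffer contents and the multiplier 5 are purely illustrative):

#include <stdio.h>
#include <string.h>
#include "galois.h"

int main(void)
{
  long data_buf[8], parity_buf[8];              /* long-word aligned 64-byte regions */
  char *data = (char *) data_buf;
  char *parity = (char *) parity_buf;

  memset(data, 0x3a, sizeof(data_buf));
  memset(parity, 0, sizeof(parity_buf));

  /* parity ^= 5 * data over GF(2^8), one byte per symbol */
  galois_w08_region_multiply(data, 5, (int) sizeof(data_buf), parity, 1);

  printf("parity[0] = 0x%02x\n", (unsigned char) parity[0]);
  return 0;
}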
diff --git a/src/erasure-code/jerasure/jerasure/include/jerasure.h b/src/erasure-code/jerasure/jerasure/include/jerasure.h
new file mode 100644
index 0000000..0836780
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/include/jerasure.h
@@ -0,0 +1,294 @@
+/* *
+ * Copyright (c) 2013, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _JERASURE_H
+#define _JERASURE_H
+
+/* This uses procedures from the Galois Field arithmetic library */
+
+#include "galois.h"
+
+/* ------------------------------------------------------------ */
+/* In all of the routines below:
+
+   k = Number of data devices
+   m = Number of coding devices
+   w = Word size
+
+   data_ptrs = An array of k pointers to data which is size bytes.  
+               Size must be a multiple of sizeof(long).
+               Pointers must also be longword aligned.
+ 
+   coding_ptrs = An array of m pointers to coding data which is size bytes.
+
+   packetsize = The size of a coding block with bitmatrix coding. 
+                When you code with a bitmatrix, you will use w packets
+                of size packetsize.
+
+   matrix = an array of k*m integers.  
+            It represents an m by k matrix.
+            Element i,j is in matrix[i*k+j];
+
+   bitmatrix = an array of k*m*w*w integers.
+            It represents an mw by kw matrix.
+            Element i,j is in bitmatrix[i*k*w+j];
+
+   erasures = an array of id's of erased devices. 
+              Id's are integers between 0 and k+m-1.
+              Id's 0 to k-1 are id's of data devices.
+              Id's k to k+m-1 are id's of coding devices: 
+                  Coding device id = id-k.
+              If there are e erasures, erasures[e] = -1.
+
+   schedule = an array of schedule operations.  
+
+              If there are m operations, then schedule[m][0] = -1.
+
+   operation = an array of 5 integers:
+
+          0 = operation: 0 for copy, 1 for xor (-1 for end)
+          1 = source device (0 - k+m-1)
+          2 = source packet (0 - w-1)
+          3 = destination device (0 - k+m-1)
+          4 = destination packet (0 - w-1)
+ */
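
As a concrete reading of the erasures and operation conventions above, for a hypothetical k = 4, m = 2, w = 8 configuration:

/* data device 1 and coding device 0 (id k = 4) have failed; -1 terminates the list */
int erasures[] = { 1, 4, -1 };

/* one schedule operation: XOR packet 2 of device 0 into packet 0 of device 5 */
int operation[5] = { 1, 0, 2, 5, 0 };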
+
+/* ---------------------------------------------------------------  */
+/* Bitmatrices / schedules ---------------------------------------- */
+/*
+ - jerasure_matrix_to_bitmatrix turns an m X k matrix in GF(2^w) into a
+                              wm X wk bitmatrix (in GF(2)).  This is
+                              explained in the Cauchy Reed-Solomon coding
+                              paper.
+
+ - jerasure_dumb_bitmatrix_to_schedule turns a bitmatrix into a schedule 
+                              using the straightforward algorithm -- just
+                              schedule the dot products defined by each
+                              row of the matrix.
+
+ - jerasure_smart_bitmatrix_to_schedule turns a bitmatrix into a schedule,
+                              but tries to use previous dot products to
+                              calculate new ones.  This is the optimization
+                              explained in the original Liberation code paper.
+
+ - jerasure_generate_schedule_cache precalculates all of the schedules for the
+                              given distribution bitmatrix.  M must equal 2.
+ 
+ - jerasure_free_schedule frees a schedule that was allocated with 
+                              jerasure_XXX_bitmatrix_to_schedule.
+ 
+ - jerasure_free_schedule_cache frees a schedule cache that was created with 
+                              jerasure_generate_schedule_cache.
+ */
+
+int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix);
+int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix);
+int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix);
+int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart);
+
+void jerasure_free_schedule(int **schedule);
+void jerasure_free_schedule_cache(int k, int m, int ***cache);
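
A small sketch of the bitmatrix/schedule pipeline these declarations describe, assuming a coding matrix from cauchy.h; the function name and the values of k, m and w are arbitrary, and error checks are omitted.

#include <stdlib.h>
#include "jerasure.h"
#include "cauchy.h"

void schedule_example(void)
{
  int k = 4, m = 2, w = 4;

  int *matrix    = cauchy_good_general_coding_matrix(k, m, w);
  int *bitmatrix = jerasure_matrix_to_bitmatrix(k, m, w, matrix);
  int **schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);

  /* ... hand the schedule to jerasure_schedule_encode() ... */

  jerasure_free_schedule(schedule);
  free(bitmatrix);
  free(matrix);
}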
+
+
+/* ------------------------------------------------------------ */
+/* Encoding - these are all straightforward.  jerasure_matrix_encode only 
+   works with w = 8|16|32.  */
+
+void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size);
+
+void jerasure_matrix_encode(int k, int m, int w, int *matrix,
+                          char **data_ptrs, char **coding_ptrs, int size);
+
+void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+void jerasure_schedule_encode(int k, int m, int w, int **schedule,
+                                  char **data_ptrs, char **coding_ptrs, int size, int packetsize);
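
A minimal matrix-encoding sketch, assuming a Vandermonde coding matrix from reed_sol.h. The function name, k, m, w, the buffer size and the fill pattern are arbitrary; size stays a multiple of sizeof(long) as required above, and error checks are omitted.

#include <stdlib.h>
#include <string.h>
#include "jerasure.h"
#include "reed_sol.h"

void encode_example(void)
{
  int k = 4, m = 2, w = 8, size = 64;
  int i;
  int *matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
  char *data[4], *coding[2];

  for (i = 0; i < k; i++) {
    data[i] = malloc(size);
    memset(data[i], 'A' + i, size);
  }
  for (i = 0; i < m; i++) coding[i] = malloc(size);

  /* fill the m coding buffers from the k data buffers */
  jerasure_matrix_encode(k, m, w, matrix, data, coding, size);
}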
+
+/* ------------------------------------------------------------ */
+/* Decoding. -------------------------------------------------- */
+
+/* These return integers, because the matrix may not be invertible. 
+   
+   The parameter row_k_ones should be set to 1 if row k of the distribution
+   matrix (or rows kw to (k+1)w-1 of the bitmatrix) is all ones
+   (or all identity matrices).  Then you can improve the performance
+   of decoding when there is more than one failure, and the parity
+   device didn't fail.  You do it by decoding all but one of the data
+   devices, and then decoding the last data device from the data devices
+   and the parity device.
+
+   jerasure_schedule_decode_lazy generates the schedule on the fly.
+
+   jerasure_matrix_decode only works when w = 8|16|32.
+
+   jerasure_make_decoding_matrix/bitmatrix make the k*k decoding matrix
+         (or wk*wk bitmatrix) by taking the rows corresponding to k
+         non-erased devices of the distribution matrix, and then
+         inverting that matrix.
+
+         You should already have allocated the decoding matrix and
+         dm_ids, which is a vector of k integers.  These will be
+         filled in appropriately.  dm_ids[i] is the id of element
+         i of the survivors vector.  I.e. row i of the decoding matrix
+         times dm_ids equals data drive i.
+
+         Both of these routines take "erased" instead of "erasures".
+         Erased is a vector with k+m elements, which has 0 or 1 for 
+         each device's id, according to whether the device is erased.
+ 
+   jerasure_erasures_to_erased allocates and returns erased from erasures.
+    
+ */
+
+int jerasure_matrix_decode(int k, int m, int w, 
+                          int *matrix, int row_k_ones, int *erasures,
+                          char **data_ptrs, char **coding_ptrs, int size);
+                          
+int jerasure_bitmatrix_decode(int k, int m, int w, 
+                            int *bitmatrix, int row_k_ones, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize,
+                            int smart);
+
+int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased, 
+                                  int *decoding_matrix, int *dm_ids);
+
+int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased, 
+                                  int *decoding_matrix, int *dm_ids);
+
+int *jerasure_erasures_to_erased(int k, int m, int *erasures);
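
Continuing the encoding sketch above, a hedged example of repairing two lost buffers with jerasure_matrix_decode. The function name is illustrative; row_k_ones is passed as 0, which is always safe, and should only be 1 when row k of the distribution matrix really is all ones, as explained above.

#include <string.h>
#include "jerasure.h"

int decode_example(int k, int m, int w, int *matrix,
                   char **data, char **coding, int size)
{
  int erasures[3];

  /* pretend data device 1 and coding device 0 (id k) were lost */
  memset(data[1], 0, size);
  memset(coding[0], 0, size);

  erasures[0] = 1;
  erasures[1] = k;
  erasures[2] = -1;

  /* returns < 0 if the surviving devices do not yield an invertible matrix */
  return jerasure_matrix_decode(k, m, w, matrix, 0, erasures,
                                data, coding, size);
}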
+
+/* ------------------------------------------------------------ */
+/* These perform dot products and schedules. -------------------*/
+/*
+   src_ids is a matrix of k id's (0 - k-1 for data devices, k - k+m-1
+   for coding devices) that identify the source devices.  Dest_id is
+   the id of the destination device.
+
+   jerasure_matrix_dotprod only works when w = 8|16|32.
+
+   jerasure_do_scheduled_operations executes the schedule on w*packetsize worth of
+   bytes from each device.  ptrs is an array of pointers which should have as many
+   elements as the highest referenced device in the schedule.
+
+ */
+ 
+void jerasure_matrix_dotprod(int k, int w, int *matrix_row,
+                          int *src_ids, int dest_id,
+                          char **data_ptrs, char **coding_ptrs, int size);
+
+void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row,
+                             int *src_ids, int dest_id,
+                             char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+void jerasure_do_scheduled_operations(char **ptrs, int **schedule, int packetsize);
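
For instance, re-encoding coding device 0 is a single dot product over the k data devices; the helper below (an illustrative name, reusing the buffers from the earlier sketches) mirrors how jerasure_matrix_decode in jerasure.c re-encodes erased coding devices.

void reencode_coding0(int k, int w, int *matrix,
                      char **data, char **coding, int size)
{
  /* dot product of matrix row 0 against the k data devices, written into
     coding device 0 (id k); NULL for src_ids makes the routine read the
     data buffers in their natural order, as the decoder itself does */
  jerasure_matrix_dotprod(k, w, matrix, NULL, k, data, coding, size);
}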
+
+/* ------------------------------------------------------------ */
+/* Matrix Inversion ------------------------------------------- */
+/*
+   The two matrix inversion functions work on rows*rows matrices of
+   ints.  If a bitmatrix, then each int will just be zero or one.
+   Otherwise, they will be elements of GF(2^w).  Obviously, you can
+   do bit matrices with jerasure_invert_matrix() and set w = 1, but
+   jerasure_invert_bitmatrix() will be more efficient.
+
+   The two invertible functions return whether a matrix is invertible.
+   They are more efficient than the inversion functions.
+
+   Mat will be destroyed when the matrix inversion or invertible
+   testing is done.  Sorry.
+
+   Inv must be allocated by the caller.
+
+   The two invert_matrix functions return 0 on success, and -1 if the
+   matrix is uninvertible.
+
+   The two invertible functions simply return whether the matrix is
+   invertible (0 or 1).  Mat will be destroyed.
+ */
+
+int jerasure_invert_matrix(int *mat, int *inv, int rows, int w);
+int jerasure_invert_bitmatrix(int *mat, int *inv, int rows);
+int jerasure_invertible_matrix(int *mat, int rows, int w);
+int jerasure_invertible_bitmatrix(int *mat, int rows);
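
A tiny inversion sketch in GF(2^8); remember from the note above that the input matrix is destroyed. The matrix values are arbitrary but chosen to be invertible.

int mat[4] = { 1, 1,
               1, 2 };   /* determinant (1*2) ^ (1*1) = 3, so invertible */
int inv[4];

if (jerasure_invert_matrix(mat, inv, 2, 8) == 0) {
  jerasure_print_matrix(inv, 2, 2, 8);   /* declared below */
}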
+
+/* ------------------------------------------------------------ */
+/* Basic matrix operations -------------------------------------*/
+/*
+   Each of the print_matrix routines require a w.  In jerasure_print_matrix,
+   this is to calculate the field width.  In jerasure_print_bitmatrix, it is
+   to put spaces between the bits.
+
+   jerasure_matrix_multiply is a simple matrix multiplier in GF(2^w).  It returns an r1*c2
+   matrix, which is the product of the two input matrices.  It allocates
+   the product.  Obviously, c1 should equal r2.  However, this is not
+   validated by the procedure.  
+*/
+
+void jerasure_print_matrix(int *matrix, int rows, int cols, int w);
+void jerasure_print_bitmatrix(int *matrix, int rows, int cols, int w);
+
+
+int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w);
+
+/* ------------------------------------------------------------ */
+/* Stats ------------------------------------------------------ */
+/*
+  jerasure_get_stats fills in a vector of three doubles:
+
+      fill_in[0] is the number of bytes that have been XOR'd
+      fill_in[1] is the number of bytes that have been copied
+      fill_in[2] is the number of bytes that have been multiplied
+                 by a constant in GF(2^w)
+
+  When jerasure_get_stats() is called, it resets its values.
+ */
+
+void jerasure_get_stats(double *fill_in);
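
Reading the counters is just:

double stats[3];

jerasure_get_stats(stats);
/* stats[0] = bytes XOR'd, stats[1] = bytes copied,
   stats[2] = bytes multiplied by a constant in GF(2^w); counters now reset */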
+
+int jerasure_autoconf_test();
+
+#endif
diff --git a/src/erasure-code/jerasure/jerasure/include/liberation.h b/src/erasure-code/jerasure/jerasure/include/liberation.h
new file mode 100644
index 0000000..f2fb723
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/include/liberation.h
@@ -0,0 +1,47 @@
+/* *
+ * Copyright (c) 2013, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _LIBERATION
+#define _LIBERATION
+
+extern int *liberation_coding_bitmatrix(int k, int w);
+extern int *liber8tion_coding_bitmatrix(int k);
+extern int *blaum_roth_coding_bitmatrix(int k, int w);
+
+#endif
diff --git a/src/erasure-code/jerasure/jerasure/include/reed_sol.h b/src/erasure-code/jerasure/jerasure/include/reed_sol.h
new file mode 100644
index 0000000..d2d8fe8
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/include/reed_sol.h
@@ -0,0 +1,50 @@
+/* *
+ * Copyright (c) 2013, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+extern int *reed_sol_vandermonde_coding_matrix(int k, int m, int w);
+extern int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w);
+extern int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w);
+
+extern int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size);
+extern int *reed_sol_r6_coding_matrix(int k, int w);
+
+extern void reed_sol_galois_w08_region_multby_2(char *region, int nbytes);
+extern void reed_sol_galois_w16_region_multby_2(char *region, int nbytes);
+extern void reed_sol_galois_w32_region_multby_2(char *region, int nbytes);
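
A short sketch of the helpers above. The function name is illustrative, w is fixed at 16 so that the multby_2 call matches the word size, and the data/coding buffers are assumed to be set up as in the earlier encoding sketch.

#include <stdlib.h>
#include <string.h>
#include "reed_sol.h"

void reed_sol_example(int k, char **data, char **coding, int size)
{
  int w = 16;
  char *buf;

  /* RAID-6 style encode: two coding devices computed from k data devices */
  reed_sol_r6_encode(k, w, data, coding, size);

  /* the multby_2 helpers scale a whole region by 2 in GF(2^w), in place */
  buf = malloc(size);
  memset(buf, 0x11, size);
  reed_sol_galois_w16_region_multby_2(buf, size);
  free(buf);
}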
diff --git a/src/erasure-code/jerasure/jerasure/src/cauchy.c b/src/erasure-code/jerasure/jerasure/src/cauchy.c
new file mode 100644
index 0000000..f63dfb7
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/src/cauchy.c
@@ -0,0 +1,405 @@
+/* *
+ * Copyright (c) 2014, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Jerasure's authors:
+
+   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
+   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
+   Revision 1.0 - 2007: James S. Plank
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+#include "jerasure.h"
+#include "cauchy.h"
+
+static int PPs[33] = { -1, -1, -1, -1, -1, -1, -1, -1,
+                       -1, -1, -1, -1, -1, -1, -1, -1,
+                       -1, -1, -1, -1, -1, -1, -1, -1,
+                       -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+static int NOs[33];
+static int ONEs[33][33];
+
+static int *cbest_0;
+static int *cbest_1;
+static int cbest_2[3];
+static int cbest_3[7];
+static int cbest_4[15];
+static int cbest_5[31];
+static int cbest_6[63];
+static int cbest_7[127];
+static int cbest_8[255];
+static int cbest_9[511];
+static int cbest_10[1023];
+static int cbest_11[1023];
+static int *cbest_12, *cbest_13, *cbest_14, *cbest_15, *cbest_16, *cbest_17, *cbest_18, *cbest_19, *cbest_20,
+           *cbest_21, *cbest_22, *cbest_23, *cbest_24, *cbest_25, *cbest_26, *cbest_27, *cbest_28, *cbest_29, *cbest_30,
+           *cbest_31, *cbest_32;
+
+static int cbest_max_k[33] = { -1, -1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 1023, -1,
+     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+     -1, -1, -1, -1 };
+
+static int cbest_init = 0;
+
+static int *cbest_all[33];
+
+
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+int cauchy_n_ones(int n, int w)
+{
+  int no;
+  int cno;
+  int nones;
+  int i, j;
+  int highbit;
+
+  highbit = (1 << (w-1));
+
+  if (PPs[w] == -1) {
+    nones = 0;
+    PPs[w] = galois_single_multiply(highbit, 2, w);
+    for (i = 0; i < w; i++) {
+      if (PPs[w] & (1 << i)) {
+        ONEs[w][nones] = (1 << i);
+        nones++;
+      }
+    }
+    NOs[w] = nones;
+  }
+
+  no = 0;
+  for (i = 0; i < w; i++) if (n & (1 << i)) no++;
+  cno = no;
+  for (i = 1; i < w; i++) {
+    if (n & highbit) {
+      n ^= highbit;
+      n <<= 1;
+      n ^= PPs[w];
+      cno--;
+      for (j = 0; j < NOs[w]; j++) {
+        cno += (n & ONEs[w][j]) ? 1 : -1;
+      }
+    } else {
+      n <<= 1;
+    } 
+    no += cno;
+  }
+  return no;
+}
+  
+int *cauchy_original_coding_matrix(int k, int m, int w)
+{
+  int *matrix;
+  int i, j, index;
+
+  if (w < 31 && (k+m) > (1 << w)) return NULL;
+  matrix = talloc(int, k*m);
+  if (matrix == NULL) return NULL;
+  index = 0;
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < k; j++) {
+      matrix[index] = galois_single_divide(1, (i ^ (m+j)), w);
+      index++;
+    }
+  }
+  return matrix;
+}
+
+int *cauchy_xy_coding_matrix(int k, int m, int w, int *X, int *Y)
+{
+  int index, i, j;
+  int *matrix;
+
+  matrix = talloc(int, k*m);
+  if (matrix == NULL) { return NULL; }
+  index = 0;
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < k; j++) {
+      matrix[index] = galois_single_divide(1, (X[i] ^ Y[j]), w);
+      index++;
+    }
+  }
+  return matrix;
+}
+
+void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix)
+{
+  int index, i, j, x;
+  int tmp;
+  int bno, tno, bno_index;
+
+  for (j = 0; j < k; j++) {
+    if (matrix[j] != 1) {
+      tmp = galois_single_divide(1, matrix[j], w);
+      index = j;
+      for (i = 0; i < m; i++) {
+        matrix[index] = galois_single_multiply(matrix[index], tmp, w);
+        index += k;
+      }
+    }
+  }
+  for (i = 1; i < m; i++) {
+    bno = 0;
+    index = i*k;
+    for (j = 0; j < k; j++) bno += cauchy_n_ones(matrix[index+j], w);
+    bno_index = -1;
+    for (j = 0; j < k; j++) {
+      if (matrix[index+j] != 1) {
+        tmp = galois_single_divide(1, matrix[index+j], w);
+        tno = 0;
+        for (x = 0; x < k; x++) {
+          tno += cauchy_n_ones(galois_single_multiply(matrix[index+x], tmp, w), w);
+        }
+        if (tno < bno) {
+          bno = tno;
+          bno_index = j;
+        }
+      }
+    }
+    if (bno_index != -1) {
+      tmp = galois_single_divide(1, matrix[index+bno_index], w);
+      for (j = 0; j < k; j++) {
+        matrix[index+j] = galois_single_multiply(matrix[index+j], tmp, w);
+      }
+    }
+  }
+}
+
+int *cauchy_good_general_coding_matrix(int k, int m, int w)
+{
+  int *matrix, i;
+
+  if (m == 2 && k <= cbest_max_k[w]) {
+    matrix = talloc(int, k*m);
+    if (matrix == NULL) return NULL;
+    if (!cbest_init) {
+      cbest_init = 1;
+      cbest_all[0] = cbest_0; cbest_all[1] = cbest_1; cbest_all[2] = cbest_2; cbest_all[3] = cbest_3; cbest_all[4] =
+      cbest_4; cbest_all[5] = cbest_5; cbest_all[6] = cbest_6; cbest_all[7] = cbest_7; cbest_all[8] = cbest_8;
+      cbest_all[9] = cbest_9; cbest_all[10] = cbest_10; cbest_all[11] = cbest_11; cbest_all[12] = cbest_12;
+      cbest_all[13] = cbest_13; cbest_all[14] = cbest_14; cbest_all[15] = cbest_15; cbest_all[16] = cbest_16;
+      cbest_all[17] = cbest_17; cbest_all[18] = cbest_18; cbest_all[19] = cbest_19; cbest_all[20] = cbest_20;
+      cbest_all[21] = cbest_21; cbest_all[22] = cbest_22; cbest_all[23] = cbest_23; cbest_all[24] = cbest_24;
+      cbest_all[25] = cbest_25; cbest_all[26] = cbest_26; cbest_all[27] = cbest_27; cbest_all[28] = cbest_28;
+      cbest_all[29] = cbest_29; cbest_all[30] = cbest_30; cbest_all[31] = cbest_31; cbest_all[32] = (int *) cbest_32;
+    }
+    for (i = 0; i < k; i++) {
+      matrix[i] = 1;
+      matrix[i+k] = cbest_all[w][i];
+    }
+    return matrix;
+  } else {
+    matrix = cauchy_original_coding_matrix(k, m, w);
+    if (matrix == NULL) return NULL;
+    cauchy_improve_coding_matrix(k, m, w, matrix);
+    return matrix;
+  }
+}
+
+static int cbest_2[3] = { 1, 2, 3 };
+static int cbest_3[7] = { 1, 2, 5, 4, 7, 3, 6 };
+
+static int cbest_4[15] = { 1, 2, 9, 4, 8, 13, 3, 6, 12, 5, 11, 15, 10, 14, 7 };
+
+static int cbest_5[31] = { 1, 2, 18, 4, 9, 8, 22, 16, 3, 11, 19, 5, 10, 6, 20, 27, 13, 23, 26, 12,
+    17, 25, 24, 31, 30, 7, 15, 21, 29, 14, 28 };
+
+static int cbest_6[63] = { 1, 2, 33, 4, 8, 49, 16, 32, 57, 3, 6, 12, 24, 48, 5, 35, 9, 37, 10, 17,
+    41, 51, 56, 61, 18, 28, 53, 14, 20, 34, 7, 13, 25, 36, 59, 26, 39, 40, 45, 50, 60, 52, 63,
+    11, 30, 55, 19, 22, 29, 43, 58, 15, 21, 38, 44, 47, 62, 27, 54, 42, 31, 23, 46 };
+
+static int cbest_7[127] = { 1, 2, 68, 4, 34, 8, 17, 16, 76, 32, 38, 3, 64, 69, 5, 19, 35, 70, 6, 9,
+    18, 102, 10, 36, 85, 12, 21, 42, 51, 72, 77, 84, 20, 25, 33, 50, 78, 98, 24, 39, 49, 100, 110
+   , 48, 65, 93, 40, 66, 71, 92, 7, 46, 55, 87, 96, 103, 106, 11, 23, 37, 54, 81, 86, 108, 13,
+    22, 27, 43, 53, 73, 80, 14, 26, 52, 74, 79, 99, 119, 44, 95, 101, 104, 111, 118, 29, 59, 89,
+    94, 117, 28, 41, 58, 67, 88, 115, 116, 47, 57, 83, 97, 107, 114, 127, 56, 82, 109, 113, 126,
+    112, 125, 15, 63, 75, 123, 124, 31, 45, 62, 91, 105, 122, 30, 61, 90, 121, 60, 120 };
+
+static int cbest_8[255] = { 1, 2, 142, 4, 71, 8, 70, 173, 3, 35, 143, 16, 17, 67, 134, 140, 172, 6, 34
+   , 69, 201, 216, 5, 33, 86, 12, 65, 138, 158, 159, 175, 10, 32, 43, 66, 108, 130, 193, 234, 9,
+    24, 25, 50, 68, 79, 100, 132, 174, 200, 217, 20, 21, 42, 48, 87, 169, 41, 54, 64, 84, 96, 117
+   , 154, 155, 165, 226, 77, 82, 135, 136, 141, 168, 192, 218, 238, 7, 18, 19, 39, 40, 78, 113,
+    116, 128, 164, 180, 195, 205, 220, 232, 14, 26, 27, 58, 109, 156, 157, 203, 235, 13, 28, 29, 38
+   , 51, 56, 75, 85, 90, 101, 110, 112, 139, 171, 11, 37, 49, 52, 76, 83, 102, 119, 131, 150, 151
+   , 167, 182, 184, 188, 197, 219, 224, 45, 55, 80, 94, 97, 133, 170, 194, 204, 221, 227, 236, 36,
+    47, 73, 92, 98, 104, 118, 152, 153, 166, 202, 207, 239, 251, 22, 23, 44, 74, 91, 148, 149, 161
+   , 181, 190, 233, 46, 59, 88, 137, 146, 147, 163, 196, 208, 212, 222, 250, 57, 81, 95, 106, 111,
+    129, 160, 176, 199, 243, 249, 15, 53, 72, 93, 103, 115, 125, 162, 183, 185, 189, 206, 225, 255,
+    186, 210, 230, 237, 242, 248, 30, 31, 62, 89, 99, 105, 114, 121, 124, 178, 209, 213, 223, 228,
+    241, 254, 60, 191, 198, 247, 120, 240, 107, 127, 144, 145, 177, 211, 214, 246, 245, 123, 126,
+    187, 231, 253, 63, 179, 229, 244, 61, 122, 215, 252 };
+
+static int cbest_9[511] = { 1, 2, 264, 4, 132, 8, 66, 16, 33, 32, 280, 64, 140, 128, 3, 70, 265, 5,
+    133, 256, 266, 6, 9, 35, 67, 134, 268, 396, 10, 17, 34, 330, 12, 18, 68, 198, 297, 20, 37, 74
+   , 136, 148, 165, 281, 296, 24, 36, 41, 65, 82, 99, 164, 272, 282, 388, 40, 49, 98, 141, 194,
+    284, 328, 412, 48, 97, 129, 142, 196, 346, 71, 72, 96, 130, 313, 392, 80, 206, 257, 267, 312,
+    334, 7, 135, 156, 173, 192, 258, 269, 397, 404, 11, 78, 144, 161, 172, 260, 270, 299, 331, 344,
+    398, 13, 19, 39, 69, 86, 103, 160, 167, 199, 202, 298, 322, 384, 14, 21, 38, 43, 75, 102, 137,
+    149, 166, 204, 289, 332, 408, 462, 22, 25, 42, 51, 83, 101, 138, 150, 273, 283, 288, 301, 350,
+    389, 429, 26, 50, 76, 100, 195, 274, 285, 300, 329, 363, 390, 413, 428, 28, 45, 84, 143, 197,
+    200, 214, 231, 276, 286, 315, 320, 347, 362, 414, 458, 44, 53, 73, 90, 107, 131, 152, 169, 181,
+    230, 314, 338, 361, 393, 400, 454, 460, 52, 57, 81, 106, 115, 168, 175, 180, 207, 229, 305, 335
+   , 348, 360, 394, 421, 478, 56, 105, 114, 157, 163, 174, 193, 210, 227, 228, 259, 304, 317, 326,
+    405, 420, 445, 79, 104, 113, 145, 158, 162, 212, 226, 261, 271, 316, 345, 379, 399, 406, 444,
+    450, 456, 87, 88, 112, 146, 203, 225, 262, 291, 323, 336, 378, 385, 425, 452, 474, 15, 205, 222
+   , 224, 239, 290, 303, 333, 367, 377, 386, 409, 424, 431, 463, 470, 476, 23, 139, 151, 189, 208,
+    238, 302, 324, 351, 366, 376, 410, 430, 437, 27, 47, 77, 94, 111, 177, 188, 237, 275, 293, 342,
+    365, 391, 436, 448, 29, 46, 55, 85, 110, 119, 171, 176, 183, 201, 215, 218, 235, 236, 277, 287,
+    292, 321, 355, 364, 415, 417, 459, 466, 472, 30, 54, 59, 91, 109, 118, 153, 170, 182, 220, 234,
+    278, 307, 339, 354, 401, 416, 423, 441, 455, 461, 468, 495, 58, 108, 117, 154, 233, 306, 319,
+    349, 353, 383, 395, 402, 422, 440, 447, 479, 494, 92, 116, 211, 232, 318, 327, 340, 352, 382,
+    446, 493, 61, 159, 213, 216, 247, 309, 381, 407, 427, 451, 457, 464, 491, 492, 60, 89, 123, 147
+   , 185, 246, 263, 308, 337, 371, 380, 426, 433, 453, 475, 487, 490, 122, 184, 191, 223, 245, 370,
+    387, 432, 439, 471, 477, 486, 489, 511, 121, 179, 190, 209, 243, 244, 295, 325, 359, 369, 411,
+    438, 485, 488, 510, 95, 120, 178, 242, 294, 343, 358, 368, 419, 449, 483, 484, 509, 219, 241,
+    357, 418, 443, 467, 473, 482, 507, 508, 31, 221, 240, 255, 279, 356, 442, 469, 481, 503, 506,
+    155, 254, 403, 480, 502, 505, 63, 93, 127, 253, 311, 341, 375, 501, 504, 62, 126, 187, 217, 251
+   , 252, 310, 374, 435, 465, 499, 500, 125, 186, 250, 373, 434, 498, 124, 249, 372, 497, 248, 496
+    };
+
+static int cbest_10[1023] = { 1, 2, 516, 4, 258, 8, 129, 16, 32, 580, 64, 128, 290, 145, 256, 3, 512,
+    517, 5, 259, 518, 588, 6, 9, 18, 36, 72, 144, 774, 10, 17, 131, 262, 288, 524, 645, 12, 33,
+    133, 266, 294, 387, 532, 576, 581, 20, 34, 65, 137, 274, 548, 582, 24, 66, 291, 838, 40, 68,
+    130, 147, 161, 322, 644, 709, 806, 48, 132, 193, 257, 386, 596, 80, 136, 298, 419, 612, 661, 772
+   , 96, 149, 260, 272, 306, 403, 513, 146, 153, 160, 264, 292, 385, 514, 519, 544, 584, 589, 708,
+    870, 7, 19, 37, 73, 192, 354, 590, 770, 775, 11, 38, 74, 177, 263, 289, 418, 520, 525, 534, 641
+   , 660, 725, 802, 836, 846, 13, 22, 76, 148, 209, 267, 295, 320, 330, 402, 526, 528, 533, 577,
+    647, 717, 804, 14, 21, 26, 35, 44, 135, 152, 165, 201, 275, 304, 384, 401, 435, 549, 578, 583,
+    604, 608, 782, 903, 25, 52, 67, 88, 139, 270, 296, 391, 417, 550, 620, 653, 790, 834, 839, 41,
+    50, 69, 104, 141, 176, 278, 302, 323, 395, 423, 540, 598, 640, 705, 724, 807, 866, 28, 42, 49,
+    70, 82, 100, 163, 208, 282, 310, 556, 592, 597, 646, 663, 677, 711, 716, 868, 878, 81, 134, 151
+   , 164, 195, 200, 299, 326, 352, 362, 400, 434, 564, 613, 657, 768, 773, 902, 967, 97, 138, 155,
+    169, 197, 261, 273, 307, 358, 390, 416, 433, 451, 614, 652, 733, 800, 814, 844, 854, 935, 56, 84
+   , 98, 140, 181, 217, 265, 293, 328, 338, 394, 422, 515, 545, 585, 704, 788, 822, 871, 919, 162,
+    179, 276, 355, 407, 427, 546, 586, 591, 616, 662, 669, 676, 710, 727, 741, 771, 780, 901, 39, 75
+   , 150, 157, 194, 211, 225, 268, 280, 308, 314, 389, 411, 439, 521, 530, 535, 628, 656, 721, 803,
+    832, 837, 842, 847, 966, 23, 77, 112, 154, 168, 196, 300, 321, 331, 393, 421, 432, 450, 522, 527
+   , 529, 552, 606, 643, 673, 693, 713, 732, 805, 864, 874, 934, 999, 15, 27, 45, 54, 78, 90, 108,
+    180, 216, 305, 483, 560, 579, 600, 605, 609, 719, 778, 783, 852, 876, 886, 899, 918, 983, 46, 53
+   , 89, 167, 178, 185, 203, 213, 271, 297, 324, 334, 336, 360, 370, 406, 426, 467, 542, 551, 610,
+    621, 649, 668, 726, 740, 786, 791, 810, 820, 835, 900, 917, 931, 951, 965, 975, 30, 51, 105, 156
+   , 205, 210, 224, 279, 303, 356, 366, 388, 405, 410, 438, 449, 459, 536, 541, 594, 599, 622, 655,
+    720, 812, 818, 862, 867, 933, 29, 43, 71, 83, 92, 101, 106, 143, 173, 283, 311, 312, 346, 392,
+    409, 420, 437, 443, 557, 566, 593, 642, 659, 672, 692, 707, 712, 737, 757, 869, 879, 911, 998,
+    60, 102, 241, 327, 353, 363, 399, 425, 482, 558, 565, 624, 679, 718, 735, 749, 769, 798, 898,
+    963, 982, 58, 86, 166, 183, 184, 202, 212, 219, 233, 286, 359, 431, 466, 615, 636, 648, 689, 729
+   , 801, 815, 840, 845, 850, 855, 884, 916, 930, 950, 964, 974, 981, 995, 1015, 57, 85, 99, 120,
+    171, 199, 204, 229, 318, 329, 339, 368, 404, 448, 458, 465, 499, 654, 671, 685, 784, 789, 823,
+    872, 882, 915, 932, 949, 997, 1007, 116, 142, 159, 172, 277, 408, 436, 442, 455, 481, 491, 547,
+    572, 587, 617, 630, 658, 665, 706, 723, 736, 756, 776, 781, 816, 860, 894, 897, 910, 947, 991,
+    114, 221, 240, 269, 281, 309, 315, 332, 342, 344, 378, 398, 424, 441, 475, 487, 531, 618, 629,
+    678, 695, 734, 743, 748, 808, 833, 843, 929, 943, 962, 973, 113, 182, 189, 218, 227, 232, 301,
+    364, 374, 430, 457, 523, 553, 562, 602, 607, 688, 728, 753, 796, 830, 865, 875, 927, 980, 994,
+    1014, 55, 79, 91, 109, 170, 187, 198, 215, 228, 284, 415, 464, 498, 554, 561, 601, 670, 675, 684
+   , 715, 745, 765, 779, 848, 853, 877, 887, 909, 914, 948, 979, 996, 1006, 1013, 47, 110, 158, 249
+   , 316, 325, 335, 337, 361, 371, 397, 447, 454, 480, 490, 497, 538, 543, 611, 632, 664, 722, 787,
+    811, 821, 880, 896, 913, 946, 961, 971, 990, 1011, 31, 94, 220, 245, 357, 367, 429, 440, 474,
+    486, 537, 595, 623, 651, 681, 694, 701, 742, 759, 813, 819, 858, 863, 892, 928, 942, 945, 972,
+    989, 993, 1003, 1023, 62, 93, 107, 188, 207, 226, 237, 243, 313, 340, 347, 376, 456, 471, 473,
+    507, 567, 568, 626, 752, 890, 907, 926, 1005, 61, 103, 124, 175, 186, 214, 372, 414, 453, 463,
+    489, 503, 559, 625, 638, 674, 691, 714, 731, 739, 744, 764, 794, 799, 828, 908, 925, 939, 959,
+    978, 1012, 59, 87, 122, 248, 287, 350, 396, 413, 446, 485, 495, 496, 637, 751, 826, 841, 851,
+    885, 912, 941, 960, 970, 977, 1010, 118, 121, 235, 244, 319, 369, 382, 428, 445, 574, 650, 667,
+    680, 700, 758, 761, 785, 873, 883, 944, 988, 992, 1002, 1009, 1022, 117, 206, 223, 231, 236, 242
+   , 470, 472, 506, 573, 631, 687, 777, 817, 856, 861, 895, 906, 987, 1004, 1021, 115, 174, 191, 333
+   , 343, 345, 379, 452, 462, 469, 488, 502, 505, 619, 690, 697, 730, 738, 755, 809, 888, 924, 938,
+    958, 969, 1019, 253, 365, 375, 412, 484, 494, 501, 563, 603, 750, 767, 792, 797, 831, 923, 940,
+    957, 976, 1001, 234, 251, 285, 348, 444, 479, 555, 634, 666, 760, 824, 849, 905, 955, 1008, 111,
+    222, 230, 247, 317, 380, 461, 511, 539, 633, 686, 703, 747, 881, 937, 986, 1020, 95, 190, 468,
+    493, 504, 570, 696, 754, 859, 893, 968, 985, 1018, 63, 126, 252, 341, 377, 500, 569, 627, 683,
+    766, 891, 922, 956, 1000, 1017, 125, 239, 250, 373, 478, 639, 795, 829, 904, 921, 954, 123, 246,
+    351, 460, 477, 510, 702, 746, 763, 827, 936, 953, 119, 383, 492, 509, 575, 984, 682, 699, 857,
+    1016, 238, 255, 889, 920, 476, 762, 793, 952, 349, 508, 635, 825, 381, 698, 254, 571, 127 };
+
+static int cbest_11[1023] = { 1,
+    2, 1026, 4, 513, 8, 16, 1282, 32, 64, 641, 128, 256, 512, 1346, 1024, 3, 673, 1027, 5, 10, 20, 40, 80, 160, 320,
+    640, 6, 9, 515, 1030, 1280, 1539, 17, 517, 1034, 1283, 12, 18, 33, 521, 1042, 1362, 34, 65, 529, 1058, 1286, 1795,
+    24, 36, 66, 129, 545, 643, 1090, 1290, 1667, 68, 130, 257, 577, 645, 672, 1154, 1298, 1344, 48, 72, 132, 258, 336,
+    649, 681, 1314, 1347, 136, 168, 260, 514, 657, 769, 1538, 1923, 84, 96, 144, 264, 516, 1025, 1350, 1410, 1859, 42,
+    272, 520, 705, 1032, 1354, 11, 21, 41, 81, 161, 192, 288, 321, 528, 675, 1028, 1537, 1699, 1794, 7, 22, 82, 162,
+    322, 544, 642, 677, 897, 1031, 1046, 1066, 1106, 1186, 1281, 1366, 1378, 1666, 14, 44, 164, 324, 384, 523, 533,
+    553, 576, 593, 644, 833, 1035, 1040, 1288, 1360, 1987, 13, 19, 28, 88, 328, 519, 648, 680, 689, 1043, 1056, 1284,
+    1363, 1474, 1543, 1793, 1955, 26, 35, 56, 176, 656, 768, 1038, 1059, 1088, 1287, 1302, 1322, 1442, 1547, 1665,
+    1922, 25, 37, 52, 67, 112, 340, 352, 525, 531, 737, 1091, 1152, 1291, 1296, 1555, 1858, 1875, 38, 69, 74, 104, 131,
+    224, 547, 651, 661, 683, 704, 721, 961, 1050, 1062, 1155, 1299, 1312, 1345, 1370, 1571, 1799, 49, 70, 73, 133, 138,
+    148, 170, 208, 259, 337, 448, 537, 549, 579, 647, 674, 929, 1094, 1294, 1315, 1352, 1536, 1603, 1671, 1698, 1803,
+    1921, 50, 134, 137, 169, 261, 266, 276, 296, 338, 416, 581, 676, 896, 1074, 1098, 1158, 1348, 1394, 1408, 1675,
+    1707, 1811, 1857, 2019, 76, 85, 97, 145, 262, 265, 522, 532, 552, 561, 585, 592, 653, 659, 685, 771, 832, 849,
+    1064, 1162, 1194, 1306, 1318, 1351, 1386, 1411, 1506, 1683, 1827, 1986, 2003, 43, 86, 98, 140, 146, 172, 273, 344,
+    518, 688, 773, 1033, 1110, 1122, 1170, 1355, 1490, 1542, 1697, 1792, 1927, 1954, 100, 193, 268, 274, 289, 597, 609,
+    665, 697, 707, 777, 1029, 1044, 1104, 1184, 1330, 1364, 1376, 1414, 1546, 1664, 1731, 1863, 1931, 1963, 23, 46, 83,
+    92, 152, 163, 184, 194, 290, 323, 368, 524, 530, 555, 693, 709, 736, 753, 785, 993, 1036, 1047, 1067, 1107, 1187,
+    1218, 1320, 1358, 1367, 1379, 1418, 1450, 1545, 1554, 1867, 1874, 1939, 1985, 15, 30, 45, 60, 90, 120, 165, 180,
+    196, 240, 280, 292, 325, 330, 360, 385, 480, 546, 650, 660, 679, 682, 713, 720, 745, 801, 899, 960, 977, 1041,
+    1289, 1361, 1426, 1472, 1541, 1570, 1703, 1798, 1953, 29, 58, 89, 116, 166, 200, 232, 326, 329, 386, 464, 535, 536,
+    548, 578, 595, 646, 835, 901, 928, 1048, 1057, 1070, 1190, 1285, 1300, 1368, 1382, 1440, 1475, 1559, 1579, 1602,
+    1619, 1670, 1802, 1879, 1891, 1920, 27, 57, 177, 304, 388, 527, 557, 580, 691, 725, 837, 905, 937, 1039, 1054,
+    1089, 1114, 1292, 1303, 1323, 1374, 1443, 1553, 1674, 1706, 1715, 1801, 1810, 1856, 1873, 1991, 2018, 2035, 53,
+    106, 113, 178, 212, 332, 341, 353, 392, 424, 541, 560, 584, 601, 652, 658, 684, 770, 841, 848, 913, 1060, 1082,
+    1096, 1153, 1202, 1297, 1402, 1478, 1522, 1569, 1673, 1682, 1705, 1797, 1826, 1959, 1995, 2002, 2027, 39, 54, 75,
+    105, 114, 225, 342, 354, 400, 539, 569, 739, 772, 1051, 1063, 1078, 1092, 1138, 1160, 1192, 1304, 1313, 1326, 1371,
+    1384, 1398, 1446, 1482, 1514, 1551, 1601, 1669, 1696, 1763, 1815, 1835, 1926, 71, 139, 149, 171, 209, 226, 298,
+    356, 449, 565, 596, 608, 625, 663, 664, 696, 706, 723, 741, 776, 853, 865, 963, 1072, 1095, 1130, 1156, 1250, 1295,
+    1310, 1353, 1392, 1687, 1730, 1747, 1809, 1862, 1930, 1962, 1971, 2007, 2017, 51, 78, 108, 135, 150, 210, 228, 267,
+    277, 297, 339, 348, 417, 450, 551, 554, 587, 617, 655, 687, 692, 708, 752, 784, 931, 965, 992, 1009, 1075, 1099,
+    1159, 1174, 1234, 1316, 1338, 1349, 1395, 1409, 1458, 1494, 1504, 1544, 1563, 1575, 1681, 1825, 1866, 1883, 1929,
+    1938, 1961, 1984, 2001, 77, 142, 174, 263, 278, 346, 376, 418, 452, 496, 583, 669, 678, 701, 712, 729, 744, 761,
+    800, 898, 933, 969, 976, 1001, 1065, 1108, 1120, 1163, 1168, 1195, 1307, 1319, 1334, 1356, 1387, 1416, 1448, 1488,
+    1507, 1540, 1607, 1702, 1807, 1865, 1925, 1952, 87, 99, 141, 147, 156, 173, 188, 216, 248, 270, 300, 345, 372, 420,
+    456, 488, 534, 563, 594, 667, 699, 757, 779, 789, 809, 834, 851, 900, 1102, 1111, 1123, 1171, 1328, 1412, 1491,
+    1558, 1578, 1587, 1611, 1618, 1679, 1711, 1729, 1861, 1878, 1890, 1907, 1943, 2023, 94, 101, 124, 154, 186, 244,
+    269, 275, 284, 526, 556, 589, 690, 724, 775, 836, 904, 936, 945, 981, 1045, 1068, 1105, 1166, 1185, 1198, 1216,
+    1331, 1365, 1377, 1390, 1415, 1430, 1510, 1552, 1577, 1714, 1800, 1819, 1831, 1872, 1899, 1937, 1990, 2034, 47, 62,
+    93, 102, 122, 153, 185, 195, 282, 291, 312, 362, 369, 432, 468, 540, 599, 600, 611, 715, 747, 840, 857, 912, 1037,
+    1052, 1112, 1126, 1219, 1321, 1359, 1372, 1419, 1424, 1451, 1568, 1623, 1635, 1672, 1691, 1701, 1704, 1723, 1796,
+    1958, 1994, 2011, 2026, 2043, 31, 61, 91, 121, 181, 197, 202, 234, 241, 281, 293, 308, 331, 361, 370, 481, 538,
+    568, 613, 695, 711, 738, 755, 781, 787, 995, 1080, 1118, 1178, 1188, 1210, 1380, 1400, 1427, 1473, 1498, 1530,
+    1550, 1557, 1600, 1617, 1668, 1719, 1735, 1762, 1779, 1814, 1834, 1843, 1877, 1889, 1935, 1967, 1993, 2025, 2039,
+    59, 117, 167, 182, 198, 201, 233, 242, 294, 327, 387, 465, 482, 559, 564, 605, 624, 662, 722, 740, 803, 852, 864,
+    881, 907, 917, 939, 962, 979, 997, 1049, 1071, 1086, 1146, 1191, 1206, 1222, 1266, 1301, 1324, 1369, 1383, 1406,
+    1422, 1441, 1454, 1480, 1512, 1526, 1549, 1686, 1713, 1739, 1746, 1771, 1808, 1833, 1871, 1970, 1989, 2006, 2016,
+    2033, 118, 305, 334, 364, 389, 394, 404, 426, 466, 484, 543, 550, 573, 586, 603, 616, 633, 654, 686, 717, 749, 793,
+    805, 843, 873, 903, 930, 964, 1008, 1055, 1115, 1128, 1142, 1200, 1226, 1258, 1293, 1308, 1375, 1476, 1520, 1562,
+    1574, 1680, 1824 };
+
diff --git a/src/erasure-code/jerasure/jerasure/src/galois.c b/src/erasure-code/jerasure/jerasure/src/galois.c
new file mode 100644
index 0000000..82702db
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/src/galois.c
@@ -0,0 +1,365 @@
+/* *
+ * Copyright (c) 2014, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Jerasure's authors:
+
+   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
+   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
+   Revision 1.0 - 2007: James S. Plank
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "galois.h"
+
+#define MAX_GF_INSTANCES 64
+gf_t *gfp_array[MAX_GF_INSTANCES] = { 0 };
+int  gfp_is_composite[MAX_GF_INSTANCES] = { 0 };
+
+gf_t *galois_get_field_ptr(int w)
+{
+  if (gfp_array[w] != NULL) {
+    return gfp_array[w];
+  }
+
+  return NULL;
+}
+
+gf_t* galois_init_field(int w,
+                        int mult_type,
+                        int region_type,
+                        int divide_type,
+                        uint64_t prim_poly,
+                        int arg1,
+                        int arg2)
+{
+  int scratch_size;
+  void *scratch_memory;
+  gf_t *gfp;
+
+  if (w <= 0 || w > 32) {
+    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
+    assert(0);
+  }
+
+  gfp = (gf_t *) malloc(sizeof(gf_t));
+  if (!gfp) {
+    fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w);
+    assert(0);
+  }
+
+  scratch_size = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+  if (!scratch_size) {
+    fprintf(stderr, "ERROR -- cannot get scratch size for base field w=%d\n", w);
+    assert(0);
+  }
+
+  scratch_memory = malloc(scratch_size);
+  if (!scratch_memory) {
+    fprintf(stderr, "ERROR -- cannot get scratch memory for base field w=%d\n", w);
+    assert(0);
+  }
+
+  if(!gf_init_hard(gfp,
+                   w, 
+                   mult_type, 
+                   region_type, 
+                   divide_type, 
+                   prim_poly, 
+                   arg1, 
+                   arg2, 
+                   NULL, 
+                   scratch_memory))
+  {
+    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
+    assert(0);
+  }
+
+  gfp_is_composite[w] = 0;
+  return gfp;
+}
+
+gf_t* galois_init_composite_field(int w,
+                                int region_type,
+                                int divide_type,
+                                int degree,
+                                gf_t* base_gf)
+{
+  int scratch_size;
+  void *scratch_memory;
+  gf_t *gfp;
+  
+  if (w <= 0 || w > 32) {
+    fprintf(stderr, "ERROR -- cannot init composite field for w=%d\n", w);
+    assert(0);
+  }
+  
+  gfp = (gf_t *) malloc(sizeof(gf_t));
+  if (!gfp) {
+    fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w);
+    assert(0);
+  }
+
+  scratch_size = gf_scratch_size(w, GF_MULT_COMPOSITE, region_type, divide_type, degree, 0);
+  if (!scratch_size) {
+    fprintf(stderr, "ERROR -- cannot get scratch size for composite field w=%d\n", w);
+    assert(0);
+  }
+
+  scratch_memory = malloc(scratch_size);
+  if (!scratch_memory) {
+    fprintf(stderr, "ERROR -- cannot get scratch memory for composite field w=%d\n", w);
+    assert(0);
+  }
+
+  if(!gf_init_hard(gfp,
+                   w,
+                   GF_MULT_COMPOSITE,
+                   region_type,
+                   divide_type,
+                   0, 
+                   degree, 
+                   0, 
+                   base_gf,
+                   scratch_memory))
+  {
+    fprintf(stderr, "ERROR -- cannot init default composite field for w=%d\n", w);
+    assert(0);
+  }
+  gfp_is_composite[w] = 1;
+  return gfp;
+}
+
+int galois_init_default_field(int w)
+{
+  if (gfp_array[w] == NULL) {
+    gfp_array[w] = (gf_t*)malloc(sizeof(gf_t));
+    if(gfp_array[w] == NULL)
+      return ENOMEM;
+    if (!gf_init_easy(gfp_array[w], w)) {
+      free(gfp_array[w]);
+      gfp_array[w] = NULL;
+      return EINVAL;
+    }
+  }
+  return 0;
+}
+
+static void galois_init(int w)
+{
+  if (w <= 0 || w > 32) {
+    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
+    assert(0);
+  }
+
+  switch (galois_init_default_field(w)) {
+  case ENOMEM:
+    fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w);
+    assert(0);
+    break;
+  case EINVAL:
+    fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w);
+    assert(0);
+    break;
+  }
+}
+
+
+static int is_valid_gf(gf_t *gf, int w)
+{
+  // TODO: I assume we may eventually
+  // want to do w=64 and 128, so w
+  // will be needed to perform this check
+  (void)w;
+
+  if (gf == NULL) {
+    return 0;
+  }
+  if (gf->multiply.w32 == NULL) {
+    return 0;
+  }
+  if (gf->multiply_region.w32 == NULL) {
+    return 0;
+  }
+  if (gf->divide.w32 == NULL) {
+    return 0;
+  }
+  if (gf->inverse.w32 == NULL) {
+    return 0;
+  }
+  if (gf->extract_word.w32 == NULL) {
+    return 0;
+  }
+
+  return 1;
+}
+
+void galois_change_technique(gf_t *gf, int w)
+{
+  if (w <= 0 || w > 32) {
+    fprintf(stderr, "ERROR -- cannot support Galois field for w=%d\n", w);
+    assert(0);
+  }
+
+  if (!is_valid_gf(gf, w)) {
+    fprintf(stderr, "ERROR -- overriding with invalid Galois field for w=%d\n", w);
+    assert(0);
+  }
+
+  if (gfp_array[w] != NULL) {
+    gf_free(gfp_array[w], gfp_is_composite[w]);
+  }
+
+  gfp_array[w] = gf;
+}
+
+int galois_single_multiply(int x, int y, int w)
+{
+  if (x == 0 || y == 0) return 0;
+  
+  if (gfp_array[w] == NULL) {
+    galois_init(w);
+  }
+
+  if (w <= 32) {
+    return gfp_array[w]->multiply.w32(gfp_array[w], x, y);
+  } else {
+    fprintf(stderr, "ERROR -- Galois field not implemented for w=%d\n", w);
+    return 0;
+  }
+}
+
+int galois_single_divide(int x, int y, int w)
+{
+  if (x == 0) return 0;
+  if (y == 0) return -1;
+
+  if (gfp_array[w] == NULL) {
+    galois_init(w);
+  }
+
+  if (w <= 32) {
+    return gfp_array[w]->divide.w32(gfp_array[w], x, y);
+  } else {
+    fprintf(stderr, "ERROR -- Galois field not implemented for w=%d\n", w);
+    return 0;
+  }
+}
+
+void galois_w08_region_multiply(char *region,      /* Region to multiply */
+                                  int multby,       /* Number to multiply by */
+                                  int nbytes,        /* Number of bytes in region */
+                                  char *r2,          /* If r2 != NULL, products go here */
+                                  int add)
+{
+  if (gfp_array[8] == NULL) {
+    galois_init(8);
+  }
+  gfp_array[8]->multiply_region.w32(gfp_array[8], region, r2, multby, nbytes, add);
+}
+
+void galois_w16_region_multiply(char *region,      /* Region to multiply */
+                                  int multby,       /* Number to multiply by */
+                                  int nbytes,        /* Number of bytes in region */
+                                  char *r2,          /* If r2 != NULL, products go here */
+                                  int add)
+{
+  if (gfp_array[16] == NULL) {
+    galois_init(16);
+  }
+  gfp_array[16]->multiply_region.w32(gfp_array[16], region, r2, multby, nbytes, add);
+}
+
+
+void galois_w32_region_multiply(char *region,      /* Region to multiply */
+                                  int multby,       /* Number to multiply by */
+                                  int nbytes,        /* Number of bytes in region */
+                                  char *r2,          /* If r2 != NULL, products go here */
+                                  int add)
+{
+  if (gfp_array[32] == NULL) {
+    galois_init(32);
+  }
+  gfp_array[32]->multiply_region.w32(gfp_array[32], region, r2, multby, nbytes, add);
+}
+
+void galois_w8_region_xor(void *src, void *dest, int nbytes)
+{
+  if (gfp_array[8] == NULL) {
+    galois_init(8);
+  }
+  gfp_array[8]->multiply_region.w32(gfp_array[8], src, dest, 1, nbytes, 1);
+}
+
+void galois_w16_region_xor(void *src, void *dest, int nbytes)
+{
+  if (gfp_array[16] == NULL) {
+    galois_init(16);
+  }
+  gfp_array[16]->multiply_region.w32(gfp_array[16], src, dest, 1, nbytes, 1);
+}
+
+void galois_w32_region_xor(void *src, void *dest, int nbytes)
+{
+  if (gfp_array[32] == NULL) {
+    galois_init(32);
+  }
+  gfp_array[32]->multiply_region.w32(gfp_array[32], src, dest, 1, nbytes, 1);
+}
+
+void galois_region_xor(char *src, char *dest, int nbytes)
+{
+  if (nbytes >= 16) {
+    galois_w32_region_xor(src, dest, nbytes);
+  } else {
+    int i = 0;
+    for (i = 0; i < nbytes; i++) {
+      *dest ^= *src;
+      dest++;
+      src++;
+    } 
+  }
+}
+
+int galois_inverse(int y, int w)
+{
+  if (y == 0) return -1;
+  return galois_single_divide(1, y, w);
+}
diff --git a/src/erasure-code/jerasure/jerasure/src/jerasure.c b/src/erasure-code/jerasure/jerasure/src/jerasure.c
new file mode 100644
index 0000000..4297653
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/src/jerasure.c
@@ -0,0 +1,1388 @@
+/* *
+ * Copyright (c) 2014, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Jerasure's authors:
+
+   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
+   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
+   Revision 1.0 - 2007: James S. Plank
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "galois.h"
+#include "jerasure.h"
+
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+static double jerasure_total_xor_bytes = 0;
+static double jerasure_total_gf_bytes = 0;
+static double jerasure_total_memcpy_bytes = 0;
+
+void jerasure_print_matrix(int *m, int rows, int cols, int w)
+{
+  int i, j;
+  int fw;
+  char s[30];
+  unsigned int w2;
+
+  if (w == 32) {
+    fw = 10;
+  } else {
+    w2 = (1 << w);
+    sprintf(s, "%u", w2-1);
+    fw = strlen(s);
+  }
+
+  for (i = 0; i < rows; i++) {
+    for (j = 0; j < cols; j++) {
+      if (j != 0) printf(" ");
+      printf("%*u", fw, m[i*cols+j]); 
+    }
+    printf("\n");
+  }
+}
+
+void jerasure_print_bitmatrix(int *m, int rows, int cols, int w)
+{
+  int i, j;
+
+  for (i = 0; i < rows; i++) {
+    if (i != 0 && i%w == 0) printf("\n");
+    for (j = 0; j < cols; j++) {
+      if (j != 0 && j%w == 0) printf(" ");
+      printf("%d", m[i*cols+j]); 
+    }
+    printf("\n");
+  }
+}
+
+int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
+{
+  int i, j, *tmpmat;
+
+  j = 0;
+  for (i = 0; j < k; i++) {
+    if (erased[i] == 0) {
+      dm_ids[j] = i;
+      j++;
+    }
+  }
+
+  tmpmat = talloc(int, k*k);
+  if (tmpmat == NULL) { return -1; }
+  for (i = 0; i < k; i++) {
+    if (dm_ids[i] < k) {
+      for (j = 0; j < k; j++) tmpmat[i*k+j] = 0;
+      tmpmat[i*k+dm_ids[i]] = 1;
+    } else {
+      for (j = 0; j < k; j++) {
+        tmpmat[i*k+j] = matrix[(dm_ids[i]-k)*k+j];
+      }
+    }
+  }
+
+  i = jerasure_invert_matrix(tmpmat, decoding_matrix, k, w);
+  free(tmpmat);
+  return i;
+}
+
+/* Internal Routine */
+int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
+{
+  int i, j, *tmpmat;
+  int index, mindex;
+
+  j = 0;
+  for (i = 0; j < k; i++) {
+    if (erased[i] == 0) {
+      dm_ids[j] = i;
+      j++;
+    }
+  }
+
+  tmpmat = talloc(int, k*k*w*w);
+  if (tmpmat == NULL) { return -1; }
+  for (i = 0; i < k; i++) {
+    if (dm_ids[i] < k) {
+      index = i*k*w*w;
+      for (j = 0; j < k*w*w; j++) tmpmat[index+j] = 0;
+      index = i*k*w*w+dm_ids[i]*w;
+      for (j = 0; j < w; j++) {
+        tmpmat[index] = 1;
+        index += (k*w+1);
+      }
+    } else {
+      index = i*k*w*w;
+      mindex = (dm_ids[i]-k)*k*w*w;
+      for (j = 0; j < k*w*w; j++) {
+        tmpmat[index+j] = matrix[mindex+j];
+      }
+    }
+  }
+
+  i = jerasure_invert_bitmatrix(tmpmat, decoding_matrix, k*w);
+  free(tmpmat);
+  return i;
+}
+
+int jerasure_matrix_decode(int k, int m, int w, int *matrix, int row_k_ones, int *erasures,
+                          char **data_ptrs, char **coding_ptrs, int size)
+{
+  int i, edd, lastdrive;
+  int *tmpids;
+  int *erased, *decoding_matrix, *dm_ids;
+
+  if (w != 8 && w != 16 && w != 32) return -1;
+
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return -1;
+
+  /* Find the number of data drives failed */
+
+  lastdrive = k;
+
+  edd = 0;
+  for (i = 0; i < k; i++) {
+    if (erased[i]) {
+      edd++;
+      lastdrive = i;
+    }
+  }
+    
+  /* You only need to create the decoding matrix in the following cases:
+
+      1. edd > 0 and row_k_ones is false.
+      2. edd > 0 and row_k_ones is true and coding device 0 has been erased.
+      3. edd > 1.
+
+      We're going to use lastdrive to denote when to stop decoding data.
+      At this point in the code, it is equal to the last erased data device.
+      However, if we can't use the parity row to decode it (i.e. row_k_ones = 0
+         or erased[k] = 1), we're going to set it to k so that the decoding
+         pass will decode all data.
+   */
+
+  if (!row_k_ones || erased[k]) lastdrive = k;
+
+  dm_ids = NULL;
+  decoding_matrix = NULL;
+
+  if (edd > 1 || (edd > 0 && (!row_k_ones || erased[k]))) {
+    dm_ids = talloc(int, k);
+    if (dm_ids == NULL) {
+      free(erased);
+      return -1;
+    }
+
+    decoding_matrix = talloc(int, k*k);
+    if (decoding_matrix == NULL) {
+      free(erased);
+      free(dm_ids);
+      return -1;
+    }
+
+    if (jerasure_make_decoding_matrix(k, m, w, matrix, erased, decoding_matrix, dm_ids) < 0) {
+      free(erased);
+      free(dm_ids);
+      free(decoding_matrix);
+      return -1;
+    }
+  }
+
+  /* Decode the data drives.  
+     If row_k_ones is true and coding device 0 is intact, then only decode edd-1 drives.
+     This is done by stopping at lastdrive.
+     We test whether edd > 0 so that we can exit the loop early if we're done.
+   */
+
+  for (i = 0; edd > 0 && i < lastdrive; i++) {
+    if (erased[i]) {
+      jerasure_matrix_dotprod(k, w, decoding_matrix+(i*k), dm_ids, i, data_ptrs, coding_ptrs, size);
+      edd--;
+    }
+  }
+
+  /* Then if necessary, decode drive lastdrive */
+
+  if (edd > 0) {
+    tmpids = talloc(int, k);
+    for (i = 0; i < k; i++) {
+      tmpids[i] = (i < lastdrive) ? i : i+1;
+    }
+    jerasure_matrix_dotprod(k, w, matrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size);
+    free(tmpids);
+  }
+  
+  /* Finally, re-encode any erased coding devices */
+
+  for (i = 0; i < m; i++) {
+    if (erased[k+i]) {
+      jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, i+k, data_ptrs, coding_ptrs, size);
+    }
+  }
+
+  free(erased);
+  if (dm_ids != NULL) free(dm_ids);
+  if (decoding_matrix != NULL) free(decoding_matrix);
+
+  return 0;
+}
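+
+/* Illustrative sketch, not part of upstream Jerasure: a minimal round trip
+   through jerasure_matrix_encode()/jerasure_matrix_decode().  It assumes k
+   data buffers and one coding buffer of `size` bytes each, and uses a single
+   all-ones coding row (plain parity), so row_k_ones is 1. */
+static int example_matrix_round_trip(int k, char **data_ptrs, char **coding_ptrs, int size)
+{
+  int w = 8;
+  int erasures[2];
+  int i, ret;
+  int *matrix;
+
+  matrix = talloc(int, k);                 /* one coding row of all ones */
+  if (matrix == NULL) return -1;
+  for (i = 0; i < k; i++) matrix[i] = 1;
+
+  jerasure_matrix_encode(k, 1, w, matrix, data_ptrs, coding_ptrs, size);
+
+  erasures[0] = 0;                         /* pretend data drive 0 failed    */
+  erasures[1] = -1;                        /* the erasure list ends with -1  */
+  ret = jerasure_matrix_decode(k, 1, w, matrix, 1, erasures, data_ptrs, coding_ptrs, size);
+
+  free(matrix);
+  return ret;                              /* 0 on success, -1 on failure */
+}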
+
+
+int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix) 
+{
+  int *bitmatrix;
+  int rowelts, rowindex, colindex, elt, i, j, l, x;
+
+  bitmatrix = talloc(int, k*m*w*w);
+  if (bitmatrix == NULL) { return NULL; }
+
+  rowelts = k * w;
+  rowindex = 0;
+
+  for (i = 0; i < m; i++) {
+    colindex = rowindex;
+    for (j = 0; j < k; j++) {
+      elt = matrix[i*k+j];
+      for (x = 0; x < w; x++) {
+        for (l = 0; l < w; l++) {
+          bitmatrix[colindex+x+l*rowelts] = ((elt & (1 << l)) ? 1 : 0);
+        }
+        elt = galois_single_multiply(elt, 2, w);
+      }
+      colindex += w;
+    }
+    rowindex += rowelts * w;
+  }
+  return bitmatrix;
+}
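+
+/* Illustrative sketch, not part of upstream Jerasure: the routine above expands
+   every GF(2^w) element of an m x k coding matrix into a w x w block of 0/1
+   ints, so the result is an (m*w) x (k*w) bitmatrix.  This prints the 4 x 4
+   block for the single element 2 with w = 4. */
+static void example_print_single_element_bitmatrix(void)
+{
+  int matrix[1] = { 2 };
+  int *bitmatrix = jerasure_matrix_to_bitmatrix(1, 1, 4, matrix);
+
+  if (bitmatrix != NULL) {
+    jerasure_print_bitmatrix(bitmatrix, 4, 4, 4);
+    free(bitmatrix);
+  }
+}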
+
+void jerasure_matrix_encode(int k, int m, int w, int *matrix,
+                          char **data_ptrs, char **coding_ptrs, int size)
+{
+  int i;
+  
+  if (w != 8 && w != 16 && w != 32) {
+    fprintf(stderr, "ERROR: jerasure_matrix_encode() and w is not 8, 16 or 32\n");
+    assert(0);
+  }
+
+  for (i = 0; i < m; i++) {
+    jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, k+i, data_ptrs, coding_ptrs, size);
+  }
+}
+
+void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row,
+                             int *src_ids, int dest_id,
+                             char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  int j, sindex, pstarted, index, x, y;
+  char *dptr, *pptr, *bdptr, *bpptr;
+
+  if (size%(w*packetsize) != 0) {
+    fprintf(stderr, "jerasure_bitmatrix_dotprod - size%c(w*packetsize)) must = 0\n", '%');
+    assert(0);
+  }
+
+  bpptr = (dest_id < k) ? data_ptrs[dest_id] : coding_ptrs[dest_id-k];
+
+  for (sindex = 0; sindex < size; sindex += (packetsize*w)) {
+    index = 0;
+    for (j = 0; j < w; j++) {
+      pstarted = 0;
+      pptr = bpptr + sindex + j*packetsize;
+      for (x = 0; x < k; x++) {
+        if (src_ids == NULL) {
+          bdptr = data_ptrs[x];
+        } else if (src_ids[x] < k) {
+          bdptr = data_ptrs[src_ids[x]];
+        } else {
+          bdptr = coding_ptrs[src_ids[x]-k];
+        }
+        for (y = 0; y < w; y++) {
+          if (bitmatrix_row[index]) {
+            dptr = bdptr + sindex + y*packetsize;
+            if (!pstarted) {
+              memcpy(pptr, dptr, packetsize);
+              jerasure_total_memcpy_bytes += packetsize;
+              pstarted = 1;
+            } else {
+              galois_region_xor(dptr, pptr, packetsize);
+              jerasure_total_xor_bytes += packetsize;
+            }
+          }
+          index++;
+        }
+      }
+    }
+  }
+}
+
+void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size) 
+{
+  int i;
+
+  memcpy(parity_ptr, data_ptrs[0], size);
+  jerasure_total_memcpy_bytes += size;
+  
+  for (i = 1; i < k; i++) {
+    galois_region_xor(data_ptrs[i], parity_ptr, size);
+    jerasure_total_xor_bytes += size;
+  }
+}
+
+int jerasure_invert_matrix(int *mat, int *inv, int rows, int w)
+{
+  int cols, i, j, k, x, rs2;
+  int row_start, tmp, inverse;
+ 
+  cols = rows;
+
+  k = 0;
+  for (i = 0; i < rows; i++) {
+    for (j = 0; j < cols; j++) {
+      inv[k] = (i == j) ? 1 : 0;
+      k++;
+    }
+  }
+
+  /* First -- convert into upper triangular  */
+  for (i = 0; i < cols; i++) {
+    row_start = cols*i;
+
+    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
+       matrix was not invertible  */
+
+    if (mat[row_start+i] == 0) { 
+      for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ;
+      if (j == rows) return -1;
+      rs2 = j*cols;
+      for (k = 0; k < cols; k++) {
+        tmp = mat[row_start+k];
+        mat[row_start+k] = mat[rs2+k];
+        mat[rs2+k] = tmp;
+        tmp = inv[row_start+k];
+        inv[row_start+k] = inv[rs2+k];
+        inv[rs2+k] = tmp;
+      }
+    }
+ 
+    /* Multiply the row by 1/element i,i  */
+    tmp = mat[row_start+i];
+    if (tmp != 1) {
+      inverse = galois_single_divide(1, tmp, w);
+      for (j = 0; j < cols; j++) { 
+        mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w);
+        inv[row_start+j] = galois_single_multiply(inv[row_start+j], inverse, w);
+      }
+    }
+
+    /* Now for each j>i, add A_ji*Ai to Aj  */
+    k = row_start+i;
+    for (j = i+1; j != cols; j++) {
+      k += cols;
+      if (mat[k] != 0) {
+        if (mat[k] == 1) {
+          rs2 = cols*j;
+          for (x = 0; x < cols; x++) {
+            mat[rs2+x] ^= mat[row_start+x];
+            inv[rs2+x] ^= inv[row_start+x];
+          }
+        } else {
+          tmp = mat[k];
+          rs2 = cols*j;
+          for (x = 0; x < cols; x++) {
+            mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w);
+            inv[rs2+x] ^= galois_single_multiply(tmp, inv[row_start+x], w);
+          }
+        }
+      }
+    }
+  }
+
+  /* Now the matrix is upper triangular.  Back-substitute from the bottom up to zero the entries above the diagonal  */
+
+  for (i = rows-1; i >= 0; i--) {
+    row_start = i*cols;
+    for (j = 0; j < i; j++) {
+      rs2 = j*cols;
+      if (mat[rs2+i] != 0) {
+        tmp = mat[rs2+i];
+        mat[rs2+i] = 0; 
+        for (k = 0; k < cols; k++) {
+          inv[rs2+k] ^= galois_single_multiply(tmp, inv[row_start+k], w);
+        }
+      }
+    }
+  }
+  return 0;
+}
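+
+/* Illustrative sketch, not part of upstream Jerasure: jerasure_invert_matrix()
+   reduces `mat` in place and leaves its inverse in `inv`, so checking the
+   result against jerasure_matrix_multiply() needs a copy of the original. */
+static int example_invert_and_check(int *mat, int rows, int w)
+{
+  int *copy, *inv, *prod;
+  int i, ok;
+
+  copy = talloc(int, rows*rows);
+  inv = talloc(int, rows*rows);
+  if (copy == NULL || inv == NULL) { free(copy); free(inv); return -1; }
+  memcpy(copy, mat, sizeof(int)*rows*rows);
+
+  if (jerasure_invert_matrix(mat, inv, rows, w) < 0) { free(copy); free(inv); return -1; }
+
+  prod = jerasure_matrix_multiply(copy, inv, rows, rows, rows, rows, w);
+  free(copy);
+  free(inv);
+  if (prod == NULL) return -1;
+
+  ok = 1;
+  for (i = 0; i < rows*rows; i++) {
+    if (prod[i] != ((i/rows == i%rows) ? 1 : 0)) ok = 0;   /* expect the identity */
+  }
+  free(prod);
+  return ok ? 0 : -1;
+}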
+
+int jerasure_invertible_matrix(int *mat, int rows, int w)
+{
+  int cols, i, j, k, x, rs2;
+  int row_start, tmp, inverse;
+ 
+  cols = rows;
+
+  /* First -- convert into upper triangular  */
+  for (i = 0; i < cols; i++) {
+    row_start = cols*i;
+
+    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
+       matrix was not invertible  */
+
+    if (mat[row_start+i] == 0) { 
+      for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ;
+      if (j == rows) return 0;
+      rs2 = j*cols;
+      for (k = 0; k < cols; k++) {
+        tmp = mat[row_start+k];
+        mat[row_start+k] = mat[rs2+k];
+        mat[rs2+k] = tmp;
+      }
+    }
+ 
+    /* Multiply the row by 1/element i,i  */
+    tmp = mat[row_start+i];
+    if (tmp != 1) {
+      inverse = galois_single_divide(1, tmp, w);
+      for (j = 0; j < cols; j++) { 
+        mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w);
+      }
+    }
+
+    /* Now for each j>i, add A_ji*Ai to Aj  */
+    k = row_start+i;
+    for (j = i+1; j != cols; j++) {
+      k += cols;
+      if (mat[k] != 0) {
+        if (mat[k] == 1) {
+          rs2 = cols*j;
+          for (x = 0; x < cols; x++) {
+            mat[rs2+x] ^= mat[row_start+x];
+          }
+        } else {
+          tmp = mat[k];
+          rs2 = cols*j;
+          for (x = 0; x < cols; x++) {
+            mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w);
+          }
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+/* Converts a list-style version of the erasures into an array of k+m elements
+   where the element = 1 if the index has been erased, and zero otherwise */
+
+int *jerasure_erasures_to_erased(int k, int m, int *erasures)
+{
+  int td;
+  int t_non_erased;
+  int *erased;
+  int i;
+
+  td = k+m;
+  erased = talloc(int, td);
+  if (erased == NULL) return NULL;
+  t_non_erased = td;
+
+  for (i = 0; i < td; i++) erased[i] = 0;
+
+  for (i = 0; erasures[i] != -1; i++) {
+    if (erased[erasures[i]] == 0) {
+      erased[erasures[i]] = 1;
+      t_non_erased--;
+      if (t_non_erased < k) {
+        free(erased);
+        return NULL;
+      }
+    }
+  }
+  return erased;
+}
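+
+/* Illustrative sketch, not part of upstream Jerasure: erasure lists are -1
+   terminated device ids, data devices first (0..k-1), then coding devices
+   (k..k+m-1).  With k = 4 and m = 2, { 1, 4, -1 } means data drive 1 and
+   coding drive 0 have failed: */
+static void example_erasures_to_erased(void)
+{
+  int erasures[3] = { 1, 4, -1 };
+  int *erased = jerasure_erasures_to_erased(4, 2, erasures);
+
+  if (erased != NULL) {
+    /* erased[] is now { 0, 1, 0, 0, 1, 0 } */
+    free(erased);
+  }
+}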
+  
+void jerasure_free_schedule(int **schedule)
+{
+  int i;
+
+  for (i = 0; schedule[i][0] >= 0; i++) free(schedule[i]);
+  free(schedule[i]);
+  free(schedule);
+}
+
+void jerasure_free_schedule_cache(int k, int m, int ***cache)
+{
+  int e1, e2;
+
+  if (m != 2) {
+    fprintf(stderr, "jerasure_free_schedule_cache(): m must equal 2\n");
+    assert(0);
+  }
+
+  for (e1 = 0; e1 < k+m; e1++) {
+    for (e2 = 0; e2 < e1; e2++) {
+      jerasure_free_schedule(cache[e1*(k+m)+e2]);
+    }
+    jerasure_free_schedule(cache[e1*(k+m)+e1]);
+  }
+  free(cache);
+}
+
+void jerasure_matrix_dotprod(int k, int w, int *matrix_row,
+                          int *src_ids, int dest_id,
+                          char **data_ptrs, char **coding_ptrs, int size)
+{
+  int init;
+  char *dptr, *sptr;
+  int i;
+
+  if (w != 1 && w != 8 && w != 16 && w != 32) {
+    fprintf(stderr, "ERROR: jerasure_matrix_dotprod() called and w is not 1, 8, 16 or 32\n");
+    assert(0);
+  }
+
+  init = 0;
+
+  dptr = (dest_id < k) ? data_ptrs[dest_id] : coding_ptrs[dest_id-k];
+
+  /* First copy or xor any data that does not need to be multiplied by a factor */
+
+  for (i = 0; i < k; i++) {
+    if (matrix_row[i] == 1) {
+      if (src_ids == NULL) {
+        sptr = data_ptrs[i];
+      } else if (src_ids[i] < k) {
+        sptr = data_ptrs[src_ids[i]];
+      } else {
+        sptr = coding_ptrs[src_ids[i]-k];
+      }
+      if (init == 0) {
+        memcpy(dptr, sptr, size);
+        jerasure_total_memcpy_bytes += size;
+        init = 1;
+      } else {
+        galois_region_xor(sptr, dptr, size);
+        jerasure_total_xor_bytes += size;
+      }
+    }
+  }
+
+  /* Now do the data that needs to be multiplied by a factor */
+
+  for (i = 0; i < k; i++) {
+    if (matrix_row[i] != 0 && matrix_row[i] != 1) {
+      if (src_ids == NULL) {
+        sptr = data_ptrs[i];
+      } else if (src_ids[i] < k) {
+        sptr = data_ptrs[src_ids[i]];
+      } else {
+        sptr = coding_ptrs[src_ids[i]-k];
+      }
+      switch (w) {
+        case 8:  galois_w08_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
+        case 16: galois_w16_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
+        case 32: galois_w32_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
+      }
+      jerasure_total_gf_bytes += size;
+      init = 1;
+    }
+  }
+}
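+
+/* Illustrative sketch, not part of upstream Jerasure: jerasure_matrix_dotprod()
+   fills one destination region with the GF(2^w) dot product of a matrix row and
+   the k data regions.  Re-encoding coding device 0 from an m x k coding matrix
+   looks like this (row 0 starts at matrix + 0*k): */
+static void example_recompute_coding_0(int k, int w, int *matrix,
+                                       char **data_ptrs, char **coding_ptrs, int size)
+{
+  /* src_ids == NULL means "read data_ptrs[0..k-1] directly";
+     dest_id == k selects coding_ptrs[0]. */
+  jerasure_matrix_dotprod(k, w, matrix, NULL, k, data_ptrs, coding_ptrs, size);
+}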
+
+
+int jerasure_bitmatrix_decode(int k, int m, int w, int *bitmatrix, int row_k_ones, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  int i;
+  int *erased;
+  int *decoding_matrix;
+  int *dm_ids;
+  int edd, *tmpids, lastdrive;
+  
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return -1;
+
+  /* See jerasure_matrix_decode for the logic of this routine.  This one works just like
+     it, but calls the bitmatrix ops instead */
+
+  lastdrive = k;
+    
+  edd = 0;
+  for (i = 0; i < k; i++) {
+    if (erased[i]) {
+      edd++;
+      lastdrive = i;
+    } 
+  }
+
+  if (row_k_ones != 1 || erased[k]) lastdrive = k;
+  
+  dm_ids = NULL;
+  decoding_matrix = NULL;
+  
+  if (edd > 1 || (edd > 0 && (row_k_ones != 1 || erased[k]))) {
+
+    dm_ids = talloc(int, k);
+    if (dm_ids == NULL) {
+      free(erased);
+      return -1;
+    }
+  
+    decoding_matrix = talloc(int, k*k*w*w);
+    if (decoding_matrix == NULL) {
+      free(erased);
+      free(dm_ids);
+      return -1;
+    }
+  
+    if (jerasure_make_decoding_bitmatrix(k, m, w, bitmatrix, erased, decoding_matrix, dm_ids) < 0) {
+      free(erased);
+      free(dm_ids);
+      free(decoding_matrix);
+      return -1;
+    }
+  }
+
+  for (i = 0; edd > 0 && i < lastdrive; i++) {
+    if (erased[i]) {
+      jerasure_bitmatrix_dotprod(k, w, decoding_matrix+i*k*w*w, dm_ids, i, data_ptrs, coding_ptrs, size, packetsize);
+      edd--;
+    }
+  }
+
+  if (edd > 0) {
+    tmpids = talloc(int, k);
+    for (i = 0; i < k; i++) {
+      tmpids[i] = (i < lastdrive) ? i : i+1;
+    }
+    jerasure_bitmatrix_dotprod(k, w, bitmatrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size, packetsize);
+    free(tmpids);
+  }
+
+  for (i = 0; i < m; i++) {
+    if (erased[k+i]) {
+      jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize);
+    }
+  }
+
+  free(erased);
+  if (dm_ids != NULL) free(dm_ids);
+  if (decoding_matrix != NULL) free(decoding_matrix);
+
+  return 0;
+}
+
+static char **set_up_ptrs_for_scheduled_decoding(int k, int m, int *erasures, char **data_ptrs, char **coding_ptrs)
+{
+  int ddf, cdf;
+  int *erased;
+  char **ptrs;
+  int i, j, x;
+
+  ddf = 0;
+  cdf = 0;
+  for (i = 0; erasures[i] != -1; i++) {
+    if (erasures[i] < k) ddf++; else cdf++;
+  }
+  
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return NULL;
+
+  /* Set up ptrs.  It will be as follows:
+
+       - If data drive i has not failed, then ptrs[i] = data_ptrs[i].
+       - If data drive i has failed, then ptrs[i] = coding_ptrs[j], where j is the
+            lowest unused non-failed coding drive.
+       - Elements k to k+ddf-1 are data_ptrs[] of the failed data drives.
+       - Elements k+ddf to k+ddf+cdf-1 are coding_ptrs[] of the failed coding drives.
+
+       The array row_ids contains the drive id held in each element of ptrs.
+       The array ind_to_row contains the index into ptrs of drive i.
+
+       However, we're going to set row_ids and ind_to_row in a different procedure.
+   */
+         
+  ptrs = talloc(char *, k+m);
+
+  j = k;
+  x = k;
+  for (i = 0; i < k; i++) {
+    if (erased[i] == 0) {
+      ptrs[i] = data_ptrs[i];
+    } else {
+      while (erased[j]) j++;
+      ptrs[i] = coding_ptrs[j-k];
+      j++;
+      ptrs[x] = data_ptrs[i];
+      x++;
+    }
+  }
+  for (i = k; i < k+m; i++) {
+    if (erased[i]) {
+      ptrs[x] = coding_ptrs[i-k];
+      x++;
+    }
+  }
+  free(erased);
+  return ptrs;
+}
+
+static int set_up_ids_for_scheduled_decoding(int k, int m, int *erasures, int *row_ids, int *ind_to_row)
+{
+  int ddf, cdf;
+  int *erased;
+  int i, j, x;
+
+  ddf = 0;
+  cdf = 0;
+  for (i = 0; erasures[i] != -1; i++) {
+    if (erasures[i] < k) ddf++; else cdf++;
+  }
+  
+  erased = jerasure_erasures_to_erased(k, m, erasures);
+  if (erased == NULL) return -1;
+
+  /* See set_up_ptrs_for_scheduled_decoding for how these are set */
+
+  j = k;
+  x = k;
+  for (i = 0; i < k; i++) {
+    if (erased[i] == 0) {
+      row_ids[i] = i;
+      ind_to_row[i] = i;
+    } else {
+      while (erased[j]) j++;
+      row_ids[i] = j;
+      ind_to_row[j] = i;
+      j++;
+      row_ids[x] = i;
+      ind_to_row[i] = x;
+      x++;
+    }
+  }
+  for (i = k; i < k+m; i++) {
+    if (erased[i]) {
+      row_ids[x] = i;
+      ind_to_row[i] = x;
+      x++;
+    }
+  }
+  free(erased);
+  return 0;
+}
+
+static int **jerasure_generate_decoding_schedule(int k, int m, int w, int *bitmatrix, int *erasures, int smart)
+{
+  int i, j, x, drive, y, index, z;
+  int *decoding_matrix, *inverse, *real_decoding_matrix;
+  int *ptr;
+  int *row_ids;
+  int *ind_to_row;
+  int ddf, cdf;
+  int **schedule;
+  int *b1, *b2;
+ 
+ /* First, figure out the number of data drives that have failed, and the
+    number of coding drives that have failed: ddf and cdf */
+
+  ddf = 0;
+  cdf = 0;
+  for (i = 0; erasures[i] != -1; i++) {
+    if (erasures[i] < k) ddf++; else cdf++;
+  }
+  
+  row_ids = talloc(int, k+m);
+  ind_to_row = talloc(int, k+m);
+
+  if (set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0) return NULL;
+
+  /* Now, we're going to create one decoding matrix which is going to 
+     decode everything with one call.  The hope is that the scheduler
+     will do a good job.    This matrix has w*e rows, where e is the
+     number of erasures (ddf+cdf) */
+
+  real_decoding_matrix = talloc(int, k*w*(cdf+ddf)*w);
+
+  /* First, if any data drives have failed, then initialize the first
+     ddf*w rows of the decoding matrix from the standard decoding
+     matrix inversion */
+
+  if (ddf > 0) {
+    
+    decoding_matrix = talloc(int, k*k*w*w);
+    ptr = decoding_matrix;
+    for (i = 0; i < k; i++) {
+      if (row_ids[i] == i) {
+        bzero(ptr, k*w*w*sizeof(int));
+        for (x = 0; x < w; x++) {
+          ptr[x+i*w+x*k*w] = 1;
+        } 
+      } else {
+        memcpy(ptr, bitmatrix+k*w*w*(row_ids[i]-k), k*w*w*sizeof(int));
+      }
+      ptr += (k*w*w);
+    }
+    inverse = talloc(int, k*k*w*w);
+    jerasure_invert_bitmatrix(decoding_matrix, inverse, k*w);
+
+/*    printf("\nMatrix to invert\n");
+    jerasure_print_bitmatrix(decoding_matrix, k*w, k*w, w);
+    printf("\n");
+    printf("\nInverse\n");
+    jerasure_print_bitmatrix(inverse, k*w, k*w, w);
+    printf("\n"); */
+
+    free(decoding_matrix);
+    ptr = real_decoding_matrix;
+    for (i = 0; i < ddf; i++) {
+      memcpy(ptr, inverse+k*w*w*row_ids[k+i], sizeof(int)*k*w*w);
+      ptr += (k*w*w);
+    }
+    free(inverse);
+  } 
+
+  /* Next, here comes the hard part.  For each coding node that needs
+     to be decoded, you start by putting its rows of the distribution
+     matrix into the decoding matrix.  If there were no failed data
+     nodes, then you're done.  However, if there have been failed
+     data nodes, then you need to modify the columns that correspond
+     to the data nodes.  You do that by first zeroing them.  Then
+     wherever there is a one in the distribution matrix, you XOR
+     in the corresponding row from the failed data node's entry in
+     the decoding matrix.  The whole process kind of makes my head
+     spin, but it works.
+   */
+
+  for (x = 0; x < cdf; x++) {
+    drive = row_ids[x+ddf+k]-k;
+    ptr = real_decoding_matrix + k*w*w*(ddf+x);
+    memcpy(ptr, bitmatrix+drive*k*w*w, sizeof(int)*k*w*w);
+
+    for (i = 0; i < k; i++) {
+      if (row_ids[i] != i) {
+        for (j = 0; j < w; j++) {
+          bzero(ptr+j*k*w+i*w, sizeof(int)*w);
+        }
+      }  
+    }
+
+    /* Here's the yucky part */
+
+    index = drive*k*w*w;
+    for (i = 0; i < k; i++) {
+      if (row_ids[i] != i) {
+        b1 = real_decoding_matrix+(ind_to_row[i]-k)*k*w*w;
+        for (j = 0; j < w; j++) {
+          b2 = ptr + j*k*w;
+          for (y = 0; y < w; y++) {
+            if (bitmatrix[index+j*k*w+i*w+y]) {
+              for (z = 0; z < k*w; z++) {
+                b2[z] = b2[z] ^ b1[z+y*k*w];
+              }
+            }
+          }
+        }
+      }  
+    }
+  }
+
+/*
+  printf("\n\nReal Decoding Matrix\n\n");
+  jerasure_print_bitmatrix(real_decoding_matrix, (ddf+cdf)*w, k*w, w);
+  printf("\n"); */
+  if (smart) {
+    schedule = jerasure_smart_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix);
+  } else {
+    schedule = jerasure_dumb_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix);
+  }
+  free(row_ids);
+  free(ind_to_row);
+  free(real_decoding_matrix);
+  return schedule;
+}
+
+int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize, 
+                            int smart)
+{
+  int i, tdone;
+  char **ptrs;
+  int **schedule;
+ 
+  ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs);
+  if (ptrs == NULL) return -1;
+
+  schedule = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart);
+  if (schedule == NULL) {
+    free(ptrs);
+    return -1;
+  }
+
+  for (tdone = 0; tdone < size; tdone += packetsize*w) {
+    jerasure_do_scheduled_operations(ptrs, schedule, packetsize);
+    for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w);
+  }
+
+  jerasure_free_schedule(schedule);
+  free(ptrs);
+
+  return 0;
+}
+
+int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  int i, tdone;
+  char **ptrs;
+  int **schedule;
+  int index;
+ 
+  if (erasures[1] == -1) {
+    index = erasures[0]*(k+m) + erasures[0];
+  } else if (erasures[2] == -1) {
+    index = erasures[0]*(k+m) + erasures[1];
+  } else {
+    return -1;
+  }
+
+  schedule = scache[index];
+
+  ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs);
+  if (ptrs == NULL) return -1;
+
+
+  for (tdone = 0; tdone < size; tdone += packetsize*w) {
+    jerasure_do_scheduled_operations(ptrs, schedule, packetsize);
+    for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w);
+  }
+
+  free(ptrs);
+
+  return 0;
+}
+
+/* This only works when m = 2 */
+
+int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart)
+{
+  int ***scache;
+  int erasures[3];
+  int e1, e2;
+ 
+  /* Ok -- this is yucky, but it's how I'm doing it.  You will make an index out
+     of erasures, which will be  e1*(k+m)+(e2).  If there is no e2, then e2 = e1.
+     Isn't that clever and confusing?  Sorry.
+
+     We're not going to worry about ordering -- in other words, the schedule for
+     e1,e2 will be the same as e2,e1.  They will have the same pointer -- the 
+     schedule will not be duplicated. */
+
+  if (m != 2) return NULL;
+
+  scache = talloc(int **, (k+m)*(k+m+1));
+  if (scache == NULL) return NULL;
+  
+  for (e1 = 0; e1 < k+m; e1++) {
+    erasures[0] = e1;
+    for (e2 = 0; e2 < e1; e2++) {
+      erasures[1] = e2;
+      erasures[2] = -1;
+      scache[e1*(k+m)+e2] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart);
+      scache[e2*(k+m)+e1] = scache[e1*(k+m)+e2];
+    }
+    erasures[1] = -1;
+    scache[e1*(k+m)+e1] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart);
+  }
+  return scache;
+
+}
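+
+/* Illustrative sketch, not part of upstream Jerasure: the schedule cache only
+   supports m = 2.  jerasure_schedule_decode_cache() recomputes the e1*(k+m)+e2
+   index from the erasure list itself, so a caller only has to build the cache
+   once and hand it the erasures: */
+static int example_decode_with_cache(int k, int w, int *bitmatrix, int *erasures,
+                                     char **data_ptrs, char **coding_ptrs,
+                                     int size, int packetsize)
+{
+  int ret;
+  int ***scache = jerasure_generate_schedule_cache(k, 2, w, bitmatrix, 1);
+
+  if (scache == NULL) return -1;
+  ret = jerasure_schedule_decode_cache(k, 2, w, scache, erasures,
+                                       data_ptrs, coding_ptrs, size, packetsize);
+  jerasure_free_schedule_cache(k, 2, scache);
+  return ret;
+}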
+
+int jerasure_invert_bitmatrix(int *mat, int *inv, int rows)
+{
+  int cols, i, j, k;
+  int tmp;
+ 
+  cols = rows;
+
+  k = 0;
+  for (i = 0; i < rows; i++) {
+    for (j = 0; j < cols; j++) {
+      inv[k] = (i == j) ? 1 : 0;
+      k++;
+    }
+  }
+
+  /* First -- convert into upper triangular */
+
+  for (i = 0; i < cols; i++) {
+
+    /* Swap rows if we have a zero i,i element.  If we can't swap, then the 
+       matrix was not invertible */
+
+    if ((mat[i*cols+i]) == 0) { 
+      for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ;
+      if (j == rows) return -1;
+      for (k = 0; k < cols; k++) {
+        tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp;
+        tmp = inv[i*cols+k]; inv[i*cols+k] = inv[j*cols+k]; inv[j*cols+k] = tmp;
+      }
+    }
+ 
+    /* Now for each j>i, add A_ji*Ai to Aj */
+    for (j = i+1; j != rows; j++) {
+      if (mat[j*cols+i] != 0) {
+        for (k = 0; k < cols; k++) {
+          mat[j*cols+k] ^= mat[i*cols+k]; 
+          inv[j*cols+k] ^= inv[i*cols+k];
+        }
+      }
+    }
+  }
+
+  /* Now the matrix is upper triangular.  Back-substitute from the bottom up to zero the entries above the diagonal */
+
+  for (i = rows-1; i >= 0; i--) {
+    for (j = 0; j < i; j++) {
+      if (mat[j*cols+i]) {
+        for (k = 0; k < cols; k++) {
+          mat[j*cols+k] ^= mat[i*cols+k]; 
+          inv[j*cols+k] ^= inv[i*cols+k];
+        }
+      }
+    }
+  } 
+  return 0;
+}
+
+int jerasure_invertible_bitmatrix(int *mat, int rows)
+{
+  int cols, i, j, k;
+  int tmp;
+ 
+  cols = rows;
+
+  /* First -- convert into upper triangular */
+
+  for (i = 0; i < cols; i++) {
+
+    /* Swap rows if we have a zero i,i element.  If we can't swap, then the 
+       matrix was not invertible */
+
+    if ((mat[i*cols+i]) == 0) { 
+      for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ;
+      if (j == rows) return 0;
+      for (k = 0; k < cols; k++) {
+        tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp;
+      }
+    }
+ 
+    /* Now for each j>i, add A_ji*Ai to Aj */
+    for (j = i+1; j != rows; j++) {
+      if (mat[j*cols+i] != 0) {
+        for (k = 0; k < cols; k++) {
+          mat[j*cols+k] ^= mat[i*cols+k]; 
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+  
+int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w)
+{
+  int *product, i, j, k;
+
+  product = (int *) malloc(sizeof(int)*r1*c2);
+  for (i = 0; i < r1*c2; i++) product[i] = 0;
+
+  for (i = 0; i < r1; i++) {
+    for (j = 0; j < c2; j++) {
+      for (k = 0; k < r2; k++) {
+        product[i*c2+j] ^= galois_single_multiply(m1[i*c1+k], m2[k*c2+j], w);
+      }
+    }
+  }
+  return product;
+}
+
+void jerasure_get_stats(double *fill_in)
+{
+  fill_in[0] = jerasure_total_xor_bytes;
+  fill_in[1] = jerasure_total_gf_bytes;
+  fill_in[2] = jerasure_total_memcpy_bytes;
+  jerasure_total_xor_bytes = 0;
+  jerasure_total_gf_bytes = 0;
+  jerasure_total_memcpy_bytes = 0;
+}
+
+void jerasure_do_scheduled_operations(char **ptrs, int **operations, int packetsize)
+{
+  char *sptr;
+  char *dptr;
+  int op;
+
+  for (op = 0; operations[op][0] >= 0; op++) {
+    sptr = ptrs[operations[op][0]] + operations[op][1]*packetsize;
+    dptr = ptrs[operations[op][2]] + operations[op][3]*packetsize;
+    if (operations[op][4]) {
+/*      printf("%d,%d %d,%d\n", operations[op][0], 
+      operations[op][1], 
+      operations[op][2], 
+      operations[op][3]); 
+      printf("xor(0x%x, 0x%x -> 0x%x, %d)\n", sptr, dptr, dptr, packetsize); */
+      galois_region_xor(sptr, dptr, packetsize);
+      jerasure_total_xor_bytes += packetsize;
+    } else {
+/*      printf("memcpy(0x%x <- 0x%x)\n", dptr, sptr); */
+      memcpy(dptr, sptr, packetsize);
+      jerasure_total_memcpy_bytes += packetsize;
+    }
+  }  
+}
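+
+/* Illustrative sketch, not part of upstream Jerasure: each schedule entry is
+   five ints -- { source drive, source packet, destination drive, destination
+   packet, op } -- where op 0 is a copy and op 1 is an XOR, and the schedule
+   ends at the first entry whose drive id is negative.  A hand-built two-step
+   schedule (copy packet 0 of drive 0 into packet 0 of drive k, then XOR in
+   packet 0 of drive 1) could look like this; it can be released with
+   jerasure_free_schedule(). */
+static int **example_tiny_schedule(int k)
+{
+  int **ops = talloc(int *, 3);
+  int i;
+
+  if (ops == NULL) return NULL;
+  for (i = 0; i < 3; i++) {
+    ops[i] = talloc(int, 5);
+    if (ops[i] == NULL) { while (i > 0) free(ops[--i]); free(ops); return NULL; }
+  }
+  ops[0][0] = 0; ops[0][1] = 0; ops[0][2] = k; ops[0][3] = 0; ops[0][4] = 0;  /* copy */
+  ops[1][0] = 1; ops[1][1] = 0; ops[1][2] = k; ops[1][3] = 0; ops[1][4] = 1;  /* xor  */
+  ops[2][0] = -1;                                                             /* end  */
+  return ops;
+}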
+
+void jerasure_schedule_encode(int k, int m, int w, int **schedule,
+                                   char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  char **ptr_copy;
+  int i, tdone;
+
+  ptr_copy = talloc(char *, (k+m));
+  for (i = 0; i < k; i++) ptr_copy[i] = data_ptrs[i];
+  for (i = 0; i < m; i++) ptr_copy[i+k] = coding_ptrs[i];
+  for (tdone = 0; tdone < size; tdone += packetsize*w) {
+    jerasure_do_scheduled_operations(ptr_copy, schedule, packetsize);
+    for (i = 0; i < k+m; i++) ptr_copy[i] += (packetsize*w);
+  }
+  free(ptr_copy);
+}
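+
+/* Illustrative sketch, not part of upstream Jerasure: the usual scheduled
+   encoding pipeline is matrix -> bitmatrix -> schedule -> encode.  `matrix` is
+   an m x k coding matrix, and size is assumed to be a multiple of w*packetsize
+   (each pass advances every pointer by that much). */
+static void example_schedule_encode(int k, int m, int w, int *matrix,
+                                    char **data_ptrs, char **coding_ptrs,
+                                    int size, int packetsize)
+{
+  int *bitmatrix = jerasure_matrix_to_bitmatrix(k, m, w, matrix);
+  int **schedule;
+
+  if (bitmatrix == NULL) return;
+  schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
+  if (schedule != NULL) {
+    jerasure_schedule_encode(k, m, w, schedule, data_ptrs, coding_ptrs, size, packetsize);
+    jerasure_free_schedule(schedule);
+  }
+  free(bitmatrix);
+}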
+    
+int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix)
+{
+  int **operations;
+  int op;
+  int index, optodo, i, j;
+
+  operations = talloc(int *, k*m*w*w+1);
+  op = 0;
+  
+  index = 0;
+  for (i = 0; i < m*w; i++) {
+    optodo = 0;
+    for (j = 0; j < k*w; j++) {
+      if (bitmatrix[index]) {
+        operations[op] = talloc(int, 5);
+        operations[op][4] = optodo;
+        operations[op][0] = j/w;
+        operations[op][1] = j%w;
+        operations[op][2] = k+i/w;
+        operations[op][3] = i%w;
+        optodo = 1;
+        op++;
+        
+      }
+      index++;
+    }
+  }
+  operations[op] = talloc(int, 5);
+  operations[op][0] = -1;
+  return operations;
+}
+
+int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix)
+{
+  int **operations;
+  int op;
+  int i, j;
+  int *diff, *from, *b1, *flink, *blink;
+  int *ptr, no, row;
+  int optodo;
+  int bestrow = 0, bestdiff, top;
+
+/*   printf("Scheduling:\n\n");
+  jerasure_print_bitmatrix(bitmatrix, m*w, k*w, w); */
+
+  operations = talloc(int *, k*m*w*w+1);
+  op = 0;
+  
+  diff = talloc(int, m*w);
+  from = talloc(int, m*w);
+  flink = talloc(int, m*w);
+  blink = talloc(int, m*w);
+
+  ptr = bitmatrix;
+
+  bestdiff = k*w+1;
+  top = 0;
+  for (i = 0; i < m*w; i++) {
+    no = 0;
+    for (j = 0; j < k*w; j++) {
+      no += *ptr;
+      ptr++;
+    }
+    diff[i] = no;
+    from[i] = -1;
+    flink[i] = i+1;
+    blink[i] = i-1;
+    if (no < bestdiff) {
+      bestdiff = no;
+      bestrow = i;
+    }
+  }
+
+  flink[m*w-1] = -1;
+  
+  while (top != -1) {
+    row = bestrow;
+    /* printf("Doing row %d - %d from %d\n", row, diff[row], from[row]);  */
+
+    if (blink[row] == -1) {
+      top = flink[row];
+      if (top != -1) blink[top] = -1;
+    } else {
+      flink[blink[row]] = flink[row];
+      if (flink[row] != -1) {
+        blink[flink[row]] = blink[row];
+      }
+    }
+
+    ptr = bitmatrix + row*k*w;
+    if (from[row] == -1) {
+      optodo = 0;
+      for (j = 0; j < k*w; j++) {
+        if (ptr[j]) {
+          operations[op] = talloc(int, 5);
+          operations[op][4] = optodo;
+          operations[op][0] = j/w;
+          operations[op][1] = j%w;
+          operations[op][2] = k+row/w;
+          operations[op][3] = row%w;
+          optodo = 1;
+          op++;
+        }
+      }
+    } else {
+      operations[op] = talloc(int, 5);
+      operations[op][4] = 0;
+      operations[op][0] = k+from[row]/w;
+      operations[op][1] = from[row]%w;
+      operations[op][2] = k+row/w;
+      operations[op][3] = row%w;
+      op++;
+      b1 = bitmatrix + from[row]*k*w;
+      for (j = 0; j < k*w; j++) {
+        if (ptr[j] ^ b1[j]) {
+          operations[op] = talloc(int, 5);
+          operations[op][4] = 1;
+          operations[op][0] = j/w;
+          operations[op][1] = j%w;
+          operations[op][2] = k+row/w;
+          operations[op][3] = row%w;
+          optodo = 1;
+          op++;
+        }
+      }
+    }
+    bestdiff = k*w+1;
+    for (i = top; i != -1; i = flink[i]) {
+      no = 1;
+      b1 = bitmatrix + i*k*w;
+      for (j = 0; j < k*w; j++) no += (ptr[j] ^ b1[j]);
+      if (no < diff[i]) {
+        from[i] = row;
+        diff[i] = no;
+      }
+      if (diff[i] < bestdiff) {
+        bestdiff = diff[i];
+        bestrow = i;
+      }
+    }
+  }
+  
+  operations[op] = talloc(int, 5);
+  operations[op][0] = -1;
+  free(from);
+  free(diff);
+  free(blink);
+  free(flink);
+
+  return operations;
+}
+
+void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix,
+                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
+{
+  int i;
+
+  if (packetsize%sizeof(long) != 0) {
+    fprintf(stderr, "jerasure_bitmatrix_encode - packetsize(%d) %% sizeof(long) != 0\n", packetsize);
+    assert(0);
+  }
+  if (size%(packetsize*w) != 0) {
+    fprintf(stderr, "jerasure_bitmatrix_encode - size(%d) %% (packetsize(%d)*w(%d)) != 0\n",
+         size, packetsize, w);
+    assert(0);
+  }
+
+  for (i = 0; i < m; i++) {
+    jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize);
+  }
+}
+
+/*
+ * Exported function for use by autoconf to perform quick 
+ * spot-check.
+ */
+int jerasure_autoconf_test()
+{
+  int x = galois_single_multiply(1, 2, 8);
+  if (x != 2) {
+    return -1;
+  }
+  return 0;
+}
+
diff --git a/src/erasure-code/jerasure/jerasure/src/liberation.c b/src/erasure-code/jerasure/jerasure/src/liberation.c
new file mode 100644
index 0000000..11a1c4f
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/src/liberation.c
@@ -0,0 +1,262 @@
+/* *
+ * Copyright (c) 2014, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Jerasure's authors:
+
+   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
+   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
+   Revision 1.0 - 2007: James S. Plank
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "galois.h"
+#include "jerasure.h"
+#include "liberation.h"
+
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+int *liberation_coding_bitmatrix(int k, int w)
+{
+  int *matrix, i, j, index;
+
+  if (k > w) return NULL;
+  matrix = talloc(int, 2*k*w*w);
+  if (matrix == NULL) return NULL;
+  bzero(matrix, sizeof(int)*2*k*w*w);
+  
+  /* Set up identity matrices */
+
+  for(i = 0; i < w; i++) {
+    index = i*k*w+i;
+    for (j = 0; j < k; j++) {
+      matrix[index] = 1;
+      index += w;
+    }
+  }
+
+  /* Set up liberation matrices */
+
+  for (j = 0; j < k; j++) {
+    index = k*w*w+j*w;
+    for (i = 0; i < w; i++) {
+      matrix[index+(j+i)%w] = 1;
+      index += (k*w);
+    }
+    if (j > 0) {
+      i = (j*((w-1)/2))%w;
+      matrix[k*w*w+j*w+i*k*w+(i+j-1)%w] = 1;
+    }
+  }
+  return matrix;
+}
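+
+/* Illustrative sketch, not part of upstream Jerasure: the routine above returns
+   a 2*k*w*w bitmatrix -- a row of identity blocks (plain parity) for coding
+   device 0 plus the liberation blocks for coding device 1 -- so it plugs
+   straight into the m = 2 bitmatrix routines.  k <= w is required (NULL is
+   returned otherwise). */
+static int example_liberation_encode(int k, int w, char **data_ptrs, char **coding_ptrs,
+                                     int size, int packetsize)
+{
+  int *bitmatrix = liberation_coding_bitmatrix(k, w);
+
+  if (bitmatrix == NULL) return -1;
+  jerasure_bitmatrix_encode(k, 2, w, bitmatrix, data_ptrs, coding_ptrs, size, packetsize);
+  free(bitmatrix);
+  return 0;
+}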
+  
+
+int *liber8tion_coding_bitmatrix(int k)
+{
+  int *matrix, i, j, index;
+  int w;
+
+  w = 8;
+  if (k > w) return NULL;
+  matrix = talloc(int, 2*k*w*w);
+  if (matrix == NULL) return NULL;
+  bzero(matrix, sizeof(int)*2*k*w*w);
+  
+  /* Set up identity matrices */
+
+  for(i = 0; i < w; i++) {
+    index = i*k*w+i;
+    for (j = 0; j < k; j++) {
+      matrix[index] = 1;
+      index += w;
+    }
+  }
+
+  /* Set up liber8tion matrices */
+
+  index = k*w*w;
+
+  if (k == 0) return matrix;
+  matrix[index+0*k*w+0*w+0] = 1;
+  matrix[index+1*k*w+0*w+1] = 1;
+  matrix[index+2*k*w+0*w+2] = 1;
+  matrix[index+3*k*w+0*w+3] = 1;
+  matrix[index+4*k*w+0*w+4] = 1;
+  matrix[index+5*k*w+0*w+5] = 1;
+  matrix[index+6*k*w+0*w+6] = 1;
+  matrix[index+7*k*w+0*w+7] = 1;
+
+  if (k == 1) return matrix;
+  matrix[index+0*k*w+1*w+7] = 1;
+  matrix[index+1*k*w+1*w+3] = 1;
+  matrix[index+2*k*w+1*w+0] = 1;
+  matrix[index+3*k*w+1*w+2] = 1;
+  matrix[index+4*k*w+1*w+6] = 1;
+  matrix[index+5*k*w+1*w+1] = 1;
+  matrix[index+6*k*w+1*w+5] = 1;
+  matrix[index+7*k*w+1*w+4] = 1;
+  matrix[index+4*k*w+1*w+7] = 1;
+
+  if (k == 2) return matrix;
+  matrix[index+0*k*w+2*w+6] = 1;
+  matrix[index+1*k*w+2*w+2] = 1;
+  matrix[index+2*k*w+2*w+4] = 1;
+  matrix[index+3*k*w+2*w+0] = 1;
+  matrix[index+4*k*w+2*w+7] = 1;
+  matrix[index+5*k*w+2*w+3] = 1;
+  matrix[index+6*k*w+2*w+1] = 1;
+  matrix[index+7*k*w+2*w+5] = 1;
+  matrix[index+1*k*w+2*w+3] = 1;
+
+  if (k == 3) return matrix;
+  matrix[index+0*k*w+3*w+2] = 1;
+  matrix[index+1*k*w+3*w+5] = 1;
+  matrix[index+2*k*w+3*w+7] = 1;
+  matrix[index+3*k*w+3*w+6] = 1;
+  matrix[index+4*k*w+3*w+0] = 1;
+  matrix[index+5*k*w+3*w+3] = 1;
+  matrix[index+6*k*w+3*w+4] = 1;
+  matrix[index+7*k*w+3*w+1] = 1;
+  matrix[index+5*k*w+3*w+4] = 1;
+
+  if (k == 4) return matrix;
+  matrix[index+0*k*w+4*w+5] = 1;
+  matrix[index+1*k*w+4*w+6] = 1;
+  matrix[index+2*k*w+4*w+1] = 1;
+  matrix[index+3*k*w+4*w+7] = 1;
+  matrix[index+4*k*w+4*w+2] = 1;
+  matrix[index+5*k*w+4*w+4] = 1;
+  matrix[index+6*k*w+4*w+3] = 1;
+  matrix[index+7*k*w+4*w+0] = 1;
+  matrix[index+2*k*w+4*w+0] = 1;
+
+  if (k == 5) return matrix;
+  matrix[index+0*k*w+5*w+1] = 1;
+  matrix[index+1*k*w+5*w+2] = 1;
+  matrix[index+2*k*w+5*w+3] = 1;
+  matrix[index+3*k*w+5*w+4] = 1;
+  matrix[index+4*k*w+5*w+5] = 1;
+  matrix[index+5*k*w+5*w+6] = 1;
+  matrix[index+6*k*w+5*w+7] = 1;
+  matrix[index+7*k*w+5*w+0] = 1;
+  matrix[index+7*k*w+5*w+2] = 1;
+
+  if (k == 6) return matrix;
+  matrix[index+0*k*w+6*w+3] = 1;
+  matrix[index+1*k*w+6*w+0] = 1;
+  matrix[index+2*k*w+6*w+6] = 1;
+  matrix[index+3*k*w+6*w+5] = 1;
+  matrix[index+4*k*w+6*w+1] = 1;
+  matrix[index+5*k*w+6*w+7] = 1;
+  matrix[index+6*k*w+6*w+4] = 1;
+  matrix[index+7*k*w+6*w+2] = 1;
+  matrix[index+6*k*w+6*w+5] = 1;
+
+  if (k == 7) return matrix;
+  matrix[index+0*k*w+7*w+4] = 1;
+  matrix[index+1*k*w+7*w+7] = 1;
+  matrix[index+2*k*w+7*w+1] = 1;
+  matrix[index+3*k*w+7*w+5] = 1;
+  matrix[index+4*k*w+7*w+3] = 1;
+  matrix[index+5*k*w+7*w+2] = 1;
+  matrix[index+6*k*w+7*w+0] = 1;
+  matrix[index+7*k*w+7*w+6] = 1;
+  matrix[index+3*k*w+7*w+1] = 1;
+
+  return matrix;
+}
+  
+int *blaum_roth_coding_bitmatrix(int k, int w)
+{
+  int *matrix, i, j, index, l, m, p;
+
+  if (k > w) return NULL;
+
+  matrix = talloc(int, 2*k*w*w);
+  if (matrix == NULL) return NULL;
+  bzero(matrix, sizeof(int)*2*k*w*w);
+  
+  /* Set up identity matrices */
+
+  for(i = 0; i < w; i++) {
+    index = i*k*w+i;
+    for (j = 0; j < k; j++) {
+      matrix[index] = 1;
+      index += w;
+    }
+  }
+
+  /* Set up blaum_roth matrices -- Ignore identity */
+
+  p = w+1;
+  for (j = 0; j < k; j++) {
+    index = k*w*w+j*w;
+    if (j == 0) {
+      for (l = 0; l < w; l++) {
+        matrix[index+l] = 1;
+        index += k*w;
+      }
+    } else {
+      i = j;
+      for (l = 1; l <= w; l++) {
+        if (l != p-i) {
+          m = l+i;
+          if (m >= p) m -= p;
+          m--;
+          matrix[index+m] = 1;
+        } else {
+          matrix[index+i-1] = 1;
+          if (i%2 == 0) {
+            m = i/2;
+          } else {
+            m = (p/2) + 1 + (i/2);
+          }
+          m--;
+          matrix[index+m] = 1;
+        }
+        index += k*w;
+      }
+    }
+  }
+
+  return matrix;
+}
diff --git a/src/erasure-code/jerasure/jerasure/src/reed_sol.c b/src/erasure-code/jerasure/jerasure/src/reed_sol.c
new file mode 100644
index 0000000..82edacb
--- /dev/null
+++ b/src/erasure-code/jerasure/jerasure/src/reed_sol.c
@@ -0,0 +1,302 @@
+/* *
+ * Copyright (c) 2014, James S. Plank and Kevin Greenan
+ * All rights reserved.
+ *
+ * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
+ * Coding Techniques
+ *
+ * Revision 2.0: Galois Field backend now links to GF-Complete
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Jerasure's authors:
+
+   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
+   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
+   Revision 1.0 - 2007: James S. Plank
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <gf_complete.h>
+#include "galois.h"
+#include "jerasure.h"
+#include "reed_sol.h"
+
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+int *reed_sol_r6_coding_matrix(int k, int w)
+{
+  int *matrix;
+  int i, tmp;
+
+  if (w != 8 && w != 16 && w != 32) return NULL;
+
+  matrix = talloc(int, 2*k);
+  if (matrix == NULL) return NULL;
+
+  for (i = 0; i < k; i++) matrix[i] = 1;
+  matrix[k] = 1;
+  tmp = 1;
+  for (i = 1; i < k; i++) {
+    tmp = galois_single_multiply(tmp, 2, w);
+    matrix[k+i] = tmp;
+  }
+  return matrix;
+}
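+
+/* Illustrative sketch, not part of upstream Jerasure: for k = 4, w = 8 the
+   RAID-6 matrix built above has two rows, { 1, 1, 1, 1 } and { 1, 2, 4, 8 } --
+   plain parity plus the sum of 2^j * D_j described in reed_sol_r6_encode(). */
+static void example_print_r6_matrix(void)
+{
+  int *matrix = reed_sol_r6_coding_matrix(4, 8);
+
+  if (matrix != NULL) {
+    jerasure_print_matrix(matrix, 2, 4, 8);
+    free(matrix);
+  }
+}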
+
+int *reed_sol_vandermonde_coding_matrix(int k, int m, int w)
+{
+  int i, j;
+  int *vdm, *dist;
+
+  vdm = reed_sol_big_vandermonde_distribution_matrix(k+m, k, w);
+  if (vdm == NULL) return NULL;
+  dist = talloc(int, m*k);
+  if (dist == NULL) {
+    free(vdm);
+    return NULL;
+  }
+
+  i = k*k;
+  for (j = 0; j < m*k; j++) {
+    dist[j] = vdm[i];
+    i++;
+  }
+  free(vdm);
+  return dist;
+}
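+
+/* Illustrative sketch, not part of upstream Jerasure: the Vandermonde coding
+   matrix is the usual entry point for general (k, m) Reed-Solomon coding here.
+   Its first row comes out all ones (see the normalization in
+   reed_sol_big_vandermonde_distribution_matrix() below), so row_k_ones is
+   passed as 1 when decoding. */
+static int example_vandermonde_round_trip(int k, int m, int w, int *erasures,
+                                          char **data_ptrs, char **coding_ptrs, int size)
+{
+  int ret;
+  int *matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
+
+  if (matrix == NULL) return -1;
+  jerasure_matrix_encode(k, m, w, matrix, data_ptrs, coding_ptrs, size);
+  ret = jerasure_matrix_decode(k, m, w, matrix, 1, erasures, data_ptrs, coding_ptrs, size);
+  free(matrix);
+  return ret;
+}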
+
+static int prim08 = -1;
+static gf_t GF08;
+
+void reed_sol_galois_w08_region_multby_2(char *region, int nbytes)
+{
+  if (prim08 == -1) {
+    prim08 = galois_single_multiply((1 << 7), 2, 8);
+    if (!gf_init_hard(&GF08, 8, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+                      prim08, 0, 0, NULL, NULL)) {
+      fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w08_region_multby_2\n");
+      assert(0);
+    }
+  }
+  GF08.multiply_region.w32(&GF08, region, region, 2, nbytes, 0);
+}
+
+static int prim16 = -1;
+static gf_t GF16;
+
+void reed_sol_galois_w16_region_multby_2(char *region, int nbytes)
+{
+  if (prim16 == -1) {
+    prim16 = galois_single_multiply((1 << 15), 2, 16);
+    if (!gf_init_hard(&GF16, 16, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+                      prim16, 0, 0, NULL, NULL)) {
+      fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w16_region_multby_2\n");
+      assert(0);
+    }
+  }
+  GF16.multiply_region.w32(&GF16, region, region, 2, nbytes, 0);
+}
+
+static int prim32 = -1;
+static gf_t GF32;
+
+void reed_sol_galois_w32_region_multby_2(char *region, int nbytes)
+{
+  if (prim32 == -1) {
+    prim32 = galois_single_multiply((1 << 31), 2, 32);
+    if (!gf_init_hard(&GF32, 32, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+                      prim32, 0, 0, NULL, NULL)) {
+      fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w32_region_multby_2\n");
+      assert(0);
+    }
+  }
+  GF32.multiply_region.w32(&GF32, region, region, 2, nbytes, 0);
+}
+
+int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size)
+{
+  int i;
+
+  /* First, put the XOR into coding region 0 */
+
+  memcpy(coding_ptrs[0], data_ptrs[0], size);
+
+  for (i = 1; i < k; i++) galois_region_xor(data_ptrs[i], coding_ptrs[0], size);
+
+  /* Next, put the sum of (2^j)*Dj into coding region 1 */
+
+  memcpy(coding_ptrs[1], data_ptrs[k-1], size);
+
+  for (i = k-2; i >= 0; i--) {
+    switch (w) {
+      case 8:  reed_sol_galois_w08_region_multby_2(coding_ptrs[1], size); break;
+      case 16: reed_sol_galois_w16_region_multby_2(coding_ptrs[1], size); break;
+      case 32: reed_sol_galois_w32_region_multby_2(coding_ptrs[1], size); break;
+      default: return 0;
+    }
+
+    galois_region_xor(data_ptrs[i], coding_ptrs[1], size);
+  }
+  return 1;
+}
+
+int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w)
+{
+  int *vdm;
+  int i, j, k;
+
+  if (w < 30 && (1 << w) < rows) return NULL;
+  if (w < 30 && (1 << w) < cols) return NULL;
+
+  vdm = talloc(int, rows*cols);
+  if (vdm == NULL) { return NULL; }
+  
+  vdm[0] = 1;
+  for (j = 1; j < cols; j++) vdm[j] = 0;
+  if (rows == 1) return vdm;
+
+  i=(rows-1)*cols;
+  for (j = 0; j < cols-1; j++) vdm[i+j] = 0;
+  vdm[i+j] = 1;
+  if (rows == 2) return vdm;
+
+  for (i = 1; i < rows-1; i++) {
+    k = 1;
+    for (j = 0; j < cols; j++) {
+      vdm[i*cols+j] = k;
+      k = galois_single_multiply(k, i, w);
+    }
+  }
+  return vdm;
+}
+
+int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w)
+{
+  int *dist;
+  int i, j, k;
+  int sindex, srindex, siindex, tmp;
+
+  if (cols >= rows) return NULL;
+  
+  dist = reed_sol_extended_vandermonde_matrix(rows, cols, w);
+  if (dist == NULL) return NULL;
+
+  sindex = 0;
+  for (i = 1; i < cols; i++) {
+    sindex += cols;
+
+    /* Find an appropriate row -- where i,i != 0 */
+    srindex = sindex+i;
+    for (j = i; j < rows && dist[srindex] == 0; j++) srindex += cols;
+    if (j >= rows) {   /* This should never happen if rows/w are correct */
+      fprintf(stderr, "reed_sol_big_vandermonde_distribution_matrix(%d,%d,%d) - couldn't make matrix\n", 
+             rows, cols, w);
+      assert(0);
+    }
+ 
+    /* If necessary, swap rows */
+    if (j != i) {
+      srindex -= i;
+      for (k = 0; k < cols; k++) {
+        tmp = dist[srindex+k];
+        dist[srindex+k] = dist[sindex+k];
+        dist[sindex+k] = tmp;
+      }
+    }
+  
+    /* If Element i,i is not equal to 1, multiply the column by 1/i */
+
+    if (dist[sindex+i] != 1) {
+      tmp = galois_single_divide(1, dist[sindex+i], w);
+      srindex = i;
+      for (j = 0; j < rows; j++) {
+        dist[srindex] = galois_single_multiply(tmp, dist[srindex], w);
+        srindex += cols;
+      }
+    }
+ 
+    /* Now, for each element in row i that is not in column 1, you need
+       to make it zero.  Suppose that this is column j, and the element
+       at i,j = e.  Then you want to replace all of column j with 
+       (col-j + col-i*e).   Note that in row i, col-i = 1 and col-j = e.
+       So (e + 1e) = 0, which is indeed what we want. */
+
+    for (j = 0; j < cols; j++) {
+      tmp = dist[sindex+j];
+      if (j != i && tmp != 0) {
+        srindex = j;
+        siindex = i;
+        for (k = 0; k < rows; k++) {
+          dist[srindex] = dist[srindex] ^ galois_single_multiply(tmp, dist[siindex], w);
+          srindex += cols;
+          siindex += cols;
+        }
+      }
+    }
+  }
+  /* We desire to have row k be all ones.  To do that, multiply
+     the entire column j by 1/dist[k,j].  Then row j by 1/dist[j,j]. */
+
+  sindex = cols*cols;
+  for (j = 0; j < cols; j++) {
+    tmp = dist[sindex];
+    if (tmp != 1) { 
+      tmp = galois_single_divide(1, tmp, w);
+      srindex = sindex;
+      for (i = cols; i < rows; i++) {
+        dist[srindex] = galois_single_multiply(tmp, dist[srindex], w);
+        srindex += cols;
+      }
+    }
+    sindex++;
+  }
+
+  /* Finally, we'd like the first element of each remaining row to be one.  To
+     do that, we multiply the row by the inverse of its first element. */
+
+  sindex = cols*(cols+1);
+  for (i = cols+1; i < rows; i++) {
+    tmp = dist[sindex];
+    if (tmp != 1) { 
+      tmp = galois_single_divide(1, tmp, w);
+      for (j = 0; j < cols; j++) dist[sindex+j] = galois_single_multiply(dist[sindex+j], tmp, w);
+    }
+    sindex += cols;
+  }
+
+  return dist;
+}
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list