[DRE-commits] [ruby-numo-narray] 01/05: Import Upstream version 0.9.0.7

Wed Jun 21 08:37:32 UTC 2017

This is an automated email from the git hooks/post-receive script.

uwabami-guest pushed a commit to branch master
in repository ruby-numo-narray.

commit b89f92d486d3439f732a5c6d325669d63a439c2e
Author: Youhei SASAKI <uwabami at gfd-dennou.org>
Date:   Wed Jun 21 16:57:22 2017 +0900

    Import Upstream version 0.9.0.7
---
 Gemfile                                        |    4 +
 README.md                                      |   60 +
 Rakefile                                       |   66 +
 ext/numo/narray/SFMT-params.h                  |   97 ++
 ext/numo/narray/SFMT-params19937.h             |   46 +
 ext/numo/narray/SFMT.c                         |  620 ++++++++
 ext/numo/narray/SFMT.h                         |  157 ++
 ext/numo/narray/array.c                        |  642 ++++++++
 ext/numo/narray/data.c                         |  966 ++++++++++++
 ext/numo/narray/depend.erb                     |   34 +
 ext/numo/narray/extconf.rb                     |   97 ++
 ext/numo/narray/gen/cogen.rb                   |   56 +
 ext/numo/narray/gen/def/bit.rb                 |   36 +
 ext/numo/narray/gen/def/dcomplex.rb            |   38 +
 ext/numo/narray/gen/def/dfloat.rb              |   36 +
 ext/numo/narray/gen/def/int16.rb               |   35 +
 ext/numo/narray/gen/def/int32.rb               |   35 +
 ext/numo/narray/gen/def/int64.rb               |   35 +
 ext/numo/narray/gen/def/int8.rb                |   35 +
 ext/numo/narray/gen/def/robject.rb             |   36 +
 ext/numo/narray/gen/def/scomplex.rb            |   38 +
 ext/numo/narray/gen/def/sfloat.rb              |   36 +
 ext/numo/narray/gen/def/uint16.rb              |   35 +
 ext/numo/narray/gen/def/uint32.rb              |   35 +
 ext/numo/narray/gen/def/uint64.rb              |   35 +
 ext/numo/narray/gen/def/uint8.rb               |   35 +
 ext/numo/narray/gen/erbpp2.rb                  |  325 ++++
 ext/numo/narray/gen/narray_def.rb              |  252 +++
 ext/numo/narray/gen/spec.rb                    |  396 +++++
 ext/numo/narray/gen/tmpl/accum.c               |   48 +
 ext/numo/narray/gen/tmpl/accum_binary.c        |   96 ++
 ext/numo/narray/gen/tmpl/accum_index.c         |   71 +
 ext/numo/narray/gen/tmpl/alloc_func.c          |  107 ++
 ext/numo/narray/gen/tmpl/allocate.c            |   35 +
 ext/numo/narray/gen/tmpl/aref.c                |   53 +
 ext/numo/narray/gen/tmpl/aset.c                |   65 +
 ext/numo/narray/gen/tmpl/binary.c              |   57 +
 ext/numo/narray/gen/tmpl/binary2.c             |   59 +
 ext/numo/narray/gen/tmpl/binary_s.c            |   34 +
 ext/numo/narray/gen/tmpl/bincount.c            |  180 +++
 ext/numo/narray/gen/tmpl/cast.c                |   44 +
 ext/numo/narray/gen/tmpl/cast_array.c          |   13 +
 ext/numo/narray/gen/tmpl/class.c               |    9 +
 ext/numo/narray/gen/tmpl/clip.c                |  118 ++
 ext/numo/narray/gen/tmpl/coerce_cast.c         |   10 +
 ext/numo/narray/gen/tmpl/cond_binary.c         |   55 +
 ext/numo/narray/gen/tmpl/cond_unary.c          |   45 +
 ext/numo/narray/gen/tmpl/cum.c                 |   49 +
 ext/numo/narray/gen/tmpl/each.c                |   43 +
 ext/numo/narray/gen/tmpl/each_with_index.c     |   64 +
 ext/numo/narray/gen/tmpl/extract.c             |   23 +
 ext/numo/narray/gen/tmpl/extract_data.c        |   48 +
 ext/numo/narray/gen/tmpl/eye.c                 |   91 ++
 ext/numo/narray/gen/tmpl/fill.c                |   38 +
 ext/numo/narray/gen/tmpl/format.c              |   60 +
 ext/numo/narray/gen/tmpl/format_to_a.c         |   47 +
 ext/numo/narray/gen/tmpl/frexp.c               |   37 +
 ext/numo/narray/gen/tmpl/init_class.c          |   20 +
 ext/numo/narray/gen/tmpl/init_module.c         |   12 +
 ext/numo/narray/gen/tmpl/inspect.c             |   20 +
 ext/numo/narray/gen/tmpl/lib.c                 |   45 +
 ext/numo/narray/gen/tmpl/logseq.c              |   82 +
 ext/numo/narray/gen/tmpl/map_with_index.c      |   94 ++
 ext/numo/narray/gen/tmpl/median.c              |   64 +
 ext/numo/narray/gen/tmpl/minmax.c              |   46 +
 ext/numo/narray/gen/tmpl/module.c              |    9 +
 ext/numo/narray/gen/tmpl/new_dim0.c            |   12 +
 ext/numo/narray/gen/tmpl/poly.c                |   49 +
 ext/numo/narray/gen/tmpl/pow.c                 |   78 +
 ext/numo/narray/gen/tmpl/powint.c              |   17 +
 ext/numo/narray/gen/tmpl/qsort.c               |  150 ++
 ext/numo/narray/gen/tmpl/rand.c                |  165 ++
 ext/numo/narray/gen/tmpl/rand_norm.c           |  119 ++
 ext/numo/narray/gen/tmpl/seq.c                 |   92 ++
 ext/numo/narray/gen/tmpl/set2.c                |   56 +
 ext/numo/narray/gen/tmpl/sort.c                |   47 +
 ext/numo/narray/gen/tmpl/sort_index.c          |  102 ++
 ext/numo/narray/gen/tmpl/store.c               |   41 +
 ext/numo/narray/gen/tmpl/store_array.c         |  102 ++
 ext/numo/narray/gen/tmpl/store_bit.c           |   55 +
 ext/numo/narray/gen/tmpl/store_from.c          |   53 +
 ext/numo/narray/gen/tmpl/store_numeric.c       |    9 +
 ext/numo/narray/gen/tmpl/to_a.c                |   41 +
 ext/numo/narray/gen/tmpl/unary.c               |   58 +
 ext/numo/narray/gen/tmpl/unary2.c              |   58 +
 ext/numo/narray/gen/tmpl/unary_ret2.c          |   33 +
 ext/numo/narray/gen/tmpl/unary_s.c             |   57 +
 ext/numo/narray/gen/tmpl_bit/allocate.c        |   24 +
 ext/numo/narray/gen/tmpl_bit/aref.c            |   55 +
 ext/numo/narray/gen/tmpl_bit/aset.c            |   65 +
 ext/numo/narray/gen/tmpl_bit/binary.c          |   94 ++
 ext/numo/narray/gen/tmpl_bit/bit_count.c       |   85 +
 ext/numo/narray/gen/tmpl_bit/bit_reduce.c      |  129 ++
 ext/numo/narray/gen/tmpl_bit/each.c            |   44 +
 ext/numo/narray/gen/tmpl_bit/each_with_index.c |   66 +
 ext/numo/narray/gen/tmpl_bit/extract.c         |   25 +
 ext/numo/narray/gen/tmpl_bit/fill.c            |   65 +
 ext/numo/narray/gen/tmpl_bit/format.c          |   61 +
 ext/numo/narray/gen/tmpl_bit/format_to_a.c     |   48 +
 ext/numo/narray/gen/tmpl_bit/inspect.c         |   18 +
 ext/numo/narray/gen/tmpl_bit/mask.c            |  132 ++
 ext/numo/narray/gen/tmpl_bit/none_p.c          |   14 +
 ext/numo/narray/gen/tmpl_bit/store_array.c     |  104 ++
 ext/numo/narray/gen/tmpl_bit/store_bit.c       |   66 +
 ext/numo/narray/gen/tmpl_bit/store_from.c      |   56 +
 ext/numo/narray/gen/tmpl_bit/to_a.c            |   43 +
 ext/numo/narray/gen/tmpl_bit/unary.c           |   77 +
 ext/numo/narray/gen/tmpl_bit/where.c           |   86 ++
 ext/numo/narray/gen/tmpl_bit/where2.c          |   91 ++
 ext/numo/narray/index.c                        |  842 ++++++++++
 ext/numo/narray/math.c                         |  147 ++
 ext/numo/narray/narray.c                       | 1975 ++++++++++++++++++++++++
 ext/numo/narray/ndloop.c                       | 1961 +++++++++++++++++++++++
 ext/numo/narray/numo/compat.h                  |   23 +
 ext/numo/narray/numo/intern.h                  |  109 ++
 ext/numo/narray/numo/narray.h                  |  429 +++++
 ext/numo/narray/numo/ndloop.h                  |   94 ++
 ext/numo/narray/numo/template.h                |  136 ++
 ext/numo/narray/numo/types/bit.h               |   33 +
 ext/numo/narray/numo/types/complex.h           |  409 +++++
 ext/numo/narray/numo/types/complex_macro.h     |  375 +++++
 ext/numo/narray/numo/types/dcomplex.h          |   44 +
 ext/numo/narray/numo/types/dfloat.h            |   42 +
 ext/numo/narray/numo/types/float_def.h         |   34 +
 ext/numo/narray/numo/types/float_macro.h       |  186 +++
 ext/numo/narray/numo/types/int16.h             |   21 +
 ext/numo/narray/numo/types/int32.h             |   21 +
 ext/numo/narray/numo/types/int64.h             |   21 +
 ext/numo/narray/numo/types/int8.h              |   21 +
 ext/numo/narray/numo/types/int_macro.h         |   35 +
 ext/numo/narray/numo/types/real_accum.h        |  440 ++++++
 ext/numo/narray/numo/types/robj_macro.h        |   75 +
 ext/numo/narray/numo/types/robject.h           |   27 +
 ext/numo/narray/numo/types/scomplex.h          |   44 +
 ext/numo/narray/numo/types/sfloat.h            |   43 +
 ext/numo/narray/numo/types/uint16.h            |   18 +
 ext/numo/narray/numo/types/uint32.h            |   18 +
 ext/numo/narray/numo/types/uint64.h            |   18 +
 ext/numo/narray/numo/types/uint8.h             |   18 +
 ext/numo/narray/numo/types/uint_macro.h        |   32 +
 ext/numo/narray/numo/types/xint_macro.h        |  173 +++
 ext/numo/narray/rand.c                         |   72 +
 ext/numo/narray/step.c                         |  501 ++++++
 ext/numo/narray/struct.c                       |  885 +++++++++++
 lib/erbpp.rb                                   |  294 ++++
 lib/erbpp/line_number.rb                       |  133 ++
 lib/erbpp/narray_def.rb                        |  381 +++++
 lib/numo/narray.rb                             |    8 +
 lib/numo/narray/extra.rb                       | 1262 +++++++++++++++
 numo-narray.gemspec                            |   36 +
 spec/bit_spec.rb                               |   93 ++
 spec/narray_spec.rb                            |  250 +++
 152 files changed, 20930 insertions(+)

diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..033d820
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in numo-narray.gemspec
+gemspec
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9f2004d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,60 @@
+# Numo::NArray - New NArray class library for Ruby/Numo (NUmerical MOdule)
+
+[![Binder](http://mybinder.org/badge.svg)](http://mybinder.org/repo/ruby-numo/narray)
+[![Build Status](https://travis-ci.org/ruby-numo/narray.svg?branch=master)](https://travis-ci.org/ruby-numo/narray)
+
+[GitHub](https://github.com/ruby-numo/narray)
+ | [RubyGems](https://rubygems.org/gems/numo-narray)
+
+Numo::NArray is a Numerical N-dimensional Array class
+for fast processing and easy manipulation of multi-dimensional numerical data,
+similar to numpy.ndarray.
+This project is the successor to [Ruby/NArray](http://masa16.github.io/narray/).
+
+under development
+
+## Documentation
+All documentation is still rudimentary.
+
+* [Numo::NArray API Doc](http://ruby-numo.github.io/narray/narray/frames.html)
+* [Numo::NArray vs numpy](https://github.com/ruby-numo/narray/wiki/Numo-vs-numpy)
+* [Numo::NArray vs ndarray](https://github.com/ruby-numo/narray/wiki/Numo-vs-ndarray)
+* [Numo::NArray Overview](https://github.com/ruby-numo/narray/wiki/Numo::NArray%E6%A6%82%E8%A6%81) (in Japanese)
+
+## Related Projects
+* [Numo::Linalg](https://github.com/ruby-numo/linalg) - Linear Algebra library with [LAPACK](http://www.netlib.org/lapack/).
+* [Numo::GSL](https://github.com/ruby-numo/gsl) - Ruby interface for [GSL (GNU Scientific Library)](http://www.gnu.org/software/gsl/).
+* [Numo::FFTE](https://github.com/ruby-numo/ffte) - Ruby interface for [FFTE (A Fast Fourier Transform library with radix-2,3,5)](http://www.ffte.jp/).
+* [Numo::Gnuplot](https://github.com/ruby-numo/gnuplot) - Simple and easy-to-use Gnuplot interface.
+
+## Installation
+### Requirement
+Ruby version 2.1 or later.
+
+### Ubuntu, Debian, Bash on Windows
+```shell
+apt install -y git ruby gcc ruby-dev rake make
+gem install specific_install
+gem specific_install https://github.com/ruby-numo/narray.git
+```
+
+## Quick start
+An example
+```ruby
+[1] pry(main)> require "numo/narray"
+=> true
+[2] pry(main)> a = Numo::DFloat.new(3,5).seq
+=> Numo::DFloat#shape=[3,5]
+[[0, 1, 2, 3, 4],
+ [5, 6, 7, 8, 9],
+ [10, 11, 12, 13, 14]]
+[3] pry(main)> a.shape
+=> [3, 5]
+[4] pry(main)> a.ndim
+=> 2
+[5] pry(main)> a.class
+=> Numo::DFloat
+[6] pry(main)> a.size
+=> 15
+```
+For more examples, check out this [narray version of 100 numpy exercises](https://github.com/ruby-numo/narray/wiki/100-narray-exercises) (and the [IRuby Notebook](https://github.com/ruby-numo/narray/blob/master/100-narray-exercises.ipynb)).
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..004e2e9
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,66 @@
+require "bundler/gem_tasks"
+begin
+# doc: runs extconf.rb and `make src` in ext/numo/narray, then `yard doc -o yard` over the listed sources.
+task :doc do
+  dir = "ext/numo/narray"
+  src = %w[array.c data.c index.c math.c narray.c rand.c struct.c].
+    map{|s| File.join(dir,s)} +
+    [File.join(dir,"types/*.c"), "lib/numo/narray/extra.rb"]
+  sh "cd ext/numo/narray; ruby extconf.rb; make src"
+  sh "rm -rf yard .yardoc; yard doc -o yard -m markdown -r README.md #{src.join(' ')}"
+end
+# These requires sit inside begin/rescue LoadError: if one is missing, the tasks below are not defined.
+require "rake/extensiontask"
+require "rake_compiler_dock"
+require "shellwords"
+
+spec = Bundler::GemHelper.gemspec
+
+cross_platforms = ["x86-mingw32", "x64-mingw32"]
+Rake::ExtensionTask.new("numo/narray", spec) do |ext|
+  ext.cross_compile = true
+  ext.cross_platform = cross_platforms
+end
+# One expected gem path under pkg/ per cross platform: "<spec.full_name>-<platform>.gem".
+pkg_dir = "pkg"
+windows_gem_paths = cross_platforms.collect do |platform|
+  File.join(pkg_dir, "#{spec.full_name}-#{platform}.gem")
+end
+
+namespace :build do
+  directory pkg_dir
+
+  desc "Build gems for Windows into the pkg directory"
+  task :windows => pkg_dir do
+    build_dir = "tmp/windows"
+    rm_rf build_dir
+    mkdir_p build_dir
+    # Each entry is one argv; entries are shell-escaped and joined with " && " into a single command line.
+    commands = [
+      ["git", "clone", "file://#{Dir.pwd}/.git", build_dir],
+      ["cd", build_dir],
+      ["bundle"],
+      ["rake", "cross", "native", "gem"],
+    ]
+    raw_commands = commands.collect do |command|
+      Shellwords.join(command)
+    end
+    raw_command_line = raw_commands.join(" && ")
+
+    RakeCompilerDock.sh(raw_command_line)
+    # Copy the gems found under build_dir/pkg back into the local pkg directory.
+    cp(Dir.glob("#{build_dir}/#{pkg_dir}/*.gem"),
+       "#{pkg_dir}/")
+  end
+end
+# release:windows depends on build:windows and runs `gem push` for each expected Windows gem path.
+namespace :release do
+  task :windows => "build:windows" do
+    windows_gem_paths.each do |path|
+      ruby("-S", "gem", "push", path)
+    end
+  end
+end
+
+rescue LoadError
+end
diff --git a/ext/numo/narray/SFMT-params.h b/ext/numo/narray/SFMT-params.h
new file mode 100644
index 0000000..cda2ccc
--- /dev/null
+++ b/ext/numo/narray/SFMT-params.h
@@ -0,0 +1,97 @@
+#ifndef SFMT_PARAMS_H
+#define SFMT_PARAMS_H
+
+#if !defined(MEXP)
+//#ifdef __GNUC__
+//  #warning "MEXP is not defined. I assume MEXP is 19937."
+//#endif
+  #define MEXP 19937
+#endif
+/*-----------------
+  BASIC DEFINITIONS
+  -----------------*/
+/** Mersenne Exponent. The period of the sequence 
+ *  is a multiple of 2^MEXP-1.
+ * #define MEXP 19937 */
+/** SFMT generator has an internal state array of 128-bit integers,
+ * and N is its size. */
+#define N (MEXP / 128 + 1)
+/** N32 is the size of internal state array when regarded as an array
+ * of 32-bit integers.*/
+#define N32 (N * 4)
+/** N64 is the size of internal state array when regarded as an array
+ * of 64-bit integers.*/
+#define N64 (N * 2)
+
+/*----------------------
+  the parameters of SFMT
+  following definitions are in paramsXXXX.h file.
+  ----------------------*/
+/** the pick up position of the array.
+#define POS1 122 
+*/
+
+/** the parameter of shift left as four 32-bit registers.
+#define SL1 18
+ */
+
+/** the parameter of shift left as one 128-bit register. 
+ * The 128-bit integer is shifted by (SL2 * 8) bits. 
+#define SL2 1 
+*/
+
+/** the parameter of shift right as four 32-bit registers.
+#define SR1 11
+*/
+
+/** the parameter of shift right as one 128-bit register. 
+ * The 128-bit integer is shifted by (SR2 * 8) bits. 
+#define SR2 1 
+*/
+
+/** A bitmask, used in the recursion.  These parameters are introduced
+ * to break symmetry of SIMD.
+#define MSK1 0xdfffffefU
+#define MSK2 0xddfecb7fU
+#define MSK3 0xbffaffffU
+#define MSK4 0xbffffff6U 
+*/
+
+/** These definitions are part of a 128-bit period certification vector.
+#define PARITY1	0x00000001U
+#define PARITY2	0x00000000U
+#define PARITY3	0x00000000U
+#define PARITY4	0x13c9e684U
+*/
+
+#if MEXP == 607
+  #include "SFMT-params607.h"
+#elif MEXP == 1279
+  #include "SFMT-params1279.h"
+#elif MEXP == 2281
+  #include "SFMT-params2281.h"
+#elif MEXP == 4253
+  #include "SFMT-params4253.h"
+#elif MEXP == 11213
+  #include "SFMT-params11213.h"
+#elif MEXP == 19937
+  #include "SFMT-params19937.h"
+#elif MEXP == 44497
+  #include "SFMT-params44497.h"
+#elif MEXP == 86243
+  #include "SFMT-params86243.h"
+#elif MEXP == 132049
+  #include "SFMT-params132049.h"
+#elif MEXP == 216091
+  #include "SFMT-params216091.h"
+#else
+#ifdef __GNUC__
+  #error "MEXP is not valid."
+  #undef MEXP
+#else
+  #undef MEXP
+#endif
+
+#endif
+
+#endif /* SFMT_PARAMS_H */
diff --git a/ext/numo/narray/SFMT-params19937.h b/ext/numo/narray/SFMT-params19937.h
new file mode 100644
index 0000000..04708cd
--- /dev/null
+++ b/ext/numo/narray/SFMT-params19937.h
@@ -0,0 +1,46 @@
+#ifndef SFMT_PARAMS19937_H
+#define SFMT_PARAMS19937_H
+
+#define POS1	122
+#define SL1	18
+#define SL2	1
+#define SR1	11
+#define SR2	1
+#define MSK1	0xdfffffefU
+#define MSK2	0xddfecb7fU
+#define MSK3	0xbffaffffU
+#define MSK4	0xbffffff6U
+#define PARITY1	0x00000001U
+#define PARITY2	0x00000000U
+#define PARITY3	0x00000000U
+#define PARITY4	0x13c9e684U
+
+
+/* PARAMETERS FOR ALTIVEC */
+#if defined(__APPLE__)	/* For OSX */
+    #define ALTI_SL1	(vector unsigned int)(SL1, SL1, SL1, SL1)
+    #define ALTI_SR1	(vector unsigned int)(SR1, SR1, SR1, SR1)
+    #define ALTI_MSK	(vector unsigned int)(MSK1, MSK2, MSK3, MSK4)
+    #define ALTI_MSK64 \
+	(vector unsigned int)(MSK2, MSK1, MSK4, MSK3)
+    #define ALTI_SL2_PERM \
+	(vector unsigned char)(1,2,3,23,5,6,7,0,9,10,11,4,13,14,15,8)
+    #define ALTI_SL2_PERM64 \
+	(vector unsigned char)(1,2,3,4,5,6,7,31,9,10,11,12,13,14,15,0)
+    #define ALTI_SR2_PERM \
+	(vector unsigned char)(7,0,1,2,11,4,5,6,15,8,9,10,17,12,13,14)
+    #define ALTI_SR2_PERM64 \
+	(vector unsigned char)(15,0,1,2,3,4,5,6,17,8,9,10,11,12,13,14)
+#else	/* For OTHER OSs(Linux?) */
+    #define ALTI_SL1	{SL1, SL1, SL1, SL1}
+    #define ALTI_SR1	{SR1, SR1, SR1, SR1}
+    #define ALTI_MSK	{MSK1, MSK2, MSK3, MSK4}
+    #define ALTI_MSK64	{MSK2, MSK1, MSK4, MSK3}
+    #define ALTI_SL2_PERM	{1,2,3,23,5,6,7,0,9,10,11,4,13,14,15,8}
+    #define ALTI_SL2_PERM64	{1,2,3,4,5,6,7,31,9,10,11,12,13,14,15,0}
+    #define ALTI_SR2_PERM	{7,0,1,2,11,4,5,6,15,8,9,10,17,12,13,14}
+    #define ALTI_SR2_PERM64	{15,0,1,2,3,4,5,6,17,8,9,10,11,12,13,14}
+#endif	/* For OSX */
+#define IDSTR	"SFMT-19937:122-18-1-11-1:dfffffef-ddfecb7f-bffaffff-bffffff6"
+
+#endif /* SFMT_PARAMS19937_H */
diff --git a/ext/numo/narray/SFMT.c b/ext/numo/narray/SFMT.c
new file mode 100644
index 0000000..d36465d
--- /dev/null
+++ b/ext/numo/narray/SFMT.c
@@ -0,0 +1,620 @@
+/** 
+ * @file  SFMT.c
+ * @brief SIMD oriented Fast Mersenne Twister(SFMT)
+ *
+ * @author Mutsuo Saito (Hiroshima University)
+ * @author Makoto Matsumoto (Hiroshima University)
+ *
+ * Copyright (C) 2006,2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University. All rights reserved.
+ *
+ * The new BSD License is applied to this software, see LICENSE.txt
+ */
+#include <string.h>
+#include <assert.h>
+#include "SFMT.h"
+#include "SFMT-params.h"
+
+#if defined(__BIG_ENDIAN__) && !defined(__amd64) && !defined(BIG_ENDIAN64)
+#define BIG_ENDIAN64 1
+#endif
+#if defined(HAVE_ALTIVEC) && !defined(BIG_ENDIAN64)
+#define BIG_ENDIAN64 1
+#endif
+#if defined(ONLY64) && !defined(BIG_ENDIAN64)
+  #if defined(__GNUC__)
+    #error "-DONLY64 must be specified with -DBIG_ENDIAN64"
+  #endif
+#undef ONLY64
+#endif
+/*------------------------------------------------------
+  128-bit SIMD data type for Altivec, SSE2 or standard C
+  ------------------------------------------------------*/
+#if defined(HAVE_ALTIVEC)
+  #if !defined(__APPLE__)
+    #include <altivec.h>
+  #endif
+/** 128-bit data structure */
+union W128_T {
+    vector unsigned int s;
+    uint32_t u[4];
+};
+/** 128-bit data type */
+typedef union W128_T w128_t;
+
+#elif defined(HAVE_SSE2)
+  #include <emmintrin.h>
+
+/** 128-bit data structure */
+union W128_T {
+    __m128i si;
+    uint32_t u[4];
+};
+/** 128-bit data type */
+typedef union W128_T w128_t;
+
+#else
+
+/** 128-bit data structure */
+struct W128_T {
+    uint32_t u[4];
+};
+/** 128-bit data type */
+typedef struct W128_T w128_t;
+
+#endif
+
+/*--------------------------------------
+  FILE GLOBAL VARIABLES
+  internal state, index counter and flag 
+  --------------------------------------*/
+/** the 128-bit internal state array */
+static w128_t sfmt[N];
+/** the 32bit integer pointer to the 128-bit internal state array */
+static uint32_t *psfmt32 = &sfmt[0].u[0];
+#if !defined(BIG_ENDIAN64) || defined(ONLY64)
+/** the 64bit integer pointer to the 128-bit internal state array */
+static uint64_t *psfmt64 = (uint64_t *)&sfmt[0].u[0];
+#endif
+/** index counter to the 32-bit internal state array */
+static int idx;
+/** a flag: it is 0 if and only if the internal state is not yet
+ * initialized. */
+static int initialized = 0;
+/** a parity check vector which certifies the period of 2^{MEXP} */
+static uint32_t parity[4] = {PARITY1, PARITY2, PARITY3, PARITY4};
+
+/*----------------
+  STATIC FUNCTIONS
+  ----------------*/
+inline static int idxof(int i);
+inline static void rshift128(w128_t *out,  w128_t const *in, int shift);
+inline static void lshift128(w128_t *out,  w128_t const *in, int shift);
+inline static void gen_rand_all(void);
+inline static void gen_rand_array(w128_t *array, int size);
+inline static uint32_t func1(uint32_t x);
+inline static uint32_t func2(uint32_t x);
+static void period_certification(void);
+#if defined(BIG_ENDIAN64) && !defined(ONLY64)
+inline static void swap(w128_t *array, int size);
+#endif
+
+#if defined(HAVE_ALTIVEC)
+  #include "SFMT-alti.h"
+#elif defined(HAVE_SSE2)
+  #include "SFMT-sse2.h"
+#endif
+
+/**
+ * Simulates 64-bit LITTLE ENDIAN indexing on a BIG ENDIAN machine: with ONLY64
+ * defined the lowest index bit is flipped (i ^ 1); otherwise i is returned as is.
+ */
+#ifdef ONLY64
+inline static int idxof(int i) {
+    return i ^ 1;
+}
+#else
+inline static int idxof(int i) {
+    return i;
+}
+#endif
+/**
+ * Simulates a SIMD 128-bit right shift in standard C: the four 32-bit words are
+ * treated as one LITTLE ENDIAN 128-bit integer and shifted by (shift * 8) bits.
+ * The ONLY64 variant pairs the words as (u[2],u[3]) and (u[0],u[1]), high word first.
+ * @param out the output of this function
+ * @param in the 128-bit data to be shifted
+ * @param shift shift in bytes; keep in 1..7 so (64 - shift * 8) stays within 0..63
+ */
+#ifdef ONLY64
+inline static void rshift128(w128_t *out, w128_t const *in, int shift) {
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[2] << 32) | ((uint64_t)in->u[3]);
+    tl = ((uint64_t)in->u[0] << 32) | ((uint64_t)in->u[1]);
+
+    oh = th >> (shift * 8);
+    ol = tl >> (shift * 8);
+    ol |= th << (64 - shift * 8);
+    out->u[0] = (uint32_t)(ol >> 32);
+    out->u[1] = (uint32_t)ol;
+    out->u[2] = (uint32_t)(oh >> 32);
+    out->u[3] = (uint32_t)oh;
+}
+#else
+inline static void rshift128(w128_t *out, w128_t const *in, int shift) {
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
+    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
+
+    oh = th >> (shift * 8);
+    ol = tl >> (shift * 8);
+    ol |= th << (64 - shift * 8);
+    out->u[1] = (uint32_t)(ol >> 32);
+    out->u[0] = (uint32_t)ol;
+    out->u[3] = (uint32_t)(oh >> 32);
+    out->u[2] = (uint32_t)oh;
+}
+#endif
+/**
+ * Simulates a SIMD 128-bit left shift in standard C: the four 32-bit words are
+ * treated as one LITTLE ENDIAN 128-bit integer and shifted by (shift * 8) bits.
+ * The ONLY64 variant pairs the words as (u[2],u[3]) and (u[0],u[1]), high word first.
+ * @param out the output of this function
+ * @param in the 128-bit data to be shifted
+ * @param shift shift in bytes; keep in 1..7 so (64 - shift * 8) stays within 0..63
+ */
+#ifdef ONLY64
+inline static void lshift128(w128_t *out, w128_t const *in, int shift) {
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[2] << 32) | ((uint64_t)in->u[3]);
+    tl = ((uint64_t)in->u[0] << 32) | ((uint64_t)in->u[1]);
+
+    oh = th << (shift * 8);
+    ol = tl << (shift * 8);
+    oh |= tl >> (64 - shift * 8);
+    out->u[0] = (uint32_t)(ol >> 32);
+    out->u[1] = (uint32_t)ol;
+    out->u[2] = (uint32_t)(oh >> 32);
+    out->u[3] = (uint32_t)oh;
+}
+#else
+inline static void lshift128(w128_t *out, w128_t const *in, int shift) {
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
+    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
+
+    oh = th << (shift * 8);
+    ol = tl << (shift * 8);
+    oh |= tl >> (64 - shift * 8);
+    out->u[1] = (uint32_t)(ol >> 32);
+    out->u[0] = (uint32_t)ol;
+    out->u[3] = (uint32_t)(oh >> 32);
+    out->u[2] = (uint32_t)oh;
+}
+#endif
+
+/**
+ * This function represents the recursion formula.
+ * @param r output
+ * @param a a 128-bit part of the internal state array
+ * @param b a 128-bit part of the internal state array
+ * @param c a 128-bit part of the internal state array
+ * @param d a 128-bit part of the internal state array
+ */
+#if (!defined(HAVE_ALTIVEC)) && (!defined(HAVE_SSE2))
+#ifdef ONLY64
+inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b, w128_t *c,
+				w128_t *d) {
+    w128_t x;
+    w128_t y;
+
+    lshift128(&x, a, SL2);
+    rshift128(&y, c, SR2);
+    r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> SR1) & MSK2) ^ y.u[0] 
+	^ (d->u[0] << SL1);
+    r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> SR1) & MSK1) ^ y.u[1] 
+	^ (d->u[1] << SL1);
+    r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> SR1) & MSK4) ^ y.u[2] 
+	^ (d->u[2] << SL1);
+    r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SR1) & MSK3) ^ y.u[3] 
+	^ (d->u[3] << SL1);
+}
+#else
+inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b, w128_t *c,
+				w128_t *d) {
+    w128_t x;
+    w128_t y;
+
+    lshift128(&x, a, SL2);
+    rshift128(&y, c, SR2);
+    r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> SR1) & MSK1) ^ y.u[0] 
+	^ (d->u[0] << SL1);
+    r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> SR1) & MSK2) ^ y.u[1] 
+	^ (d->u[1] << SL1);
+    r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> SR1) & MSK3) ^ y.u[2] 
+	^ (d->u[2] << SL1);
+    r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SR1) & MSK4) ^ y.u[3] 
+	^ (d->u[3] << SL1);
+}
+#endif
+#endif
+
+#if (!defined(HAVE_ALTIVEC)) && (!defined(HAVE_SSE2))
+/**
+ * This function fills the internal state array with pseudorandom
+ * integers.
+ */
+inline static void gen_rand_all(void) {
+    int i;
+    w128_t *r1, *r2;
+
+    r1 = &sfmt[N - 2];
+    r2 = &sfmt[N - 1];
+    for (i = 0; i < N - POS1; i++) {
+	do_recursion(&sfmt[i], &sfmt[i], &sfmt[i + POS1], r1, r2);
+	r1 = r2;
+	r2 = &sfmt[i];
+    }
+    for (; i < N; i++) {
+	do_recursion(&sfmt[i], &sfmt[i], &sfmt[i + POS1 - N], r1, r2);
+	r1 = r2;
+	r2 = &sfmt[i];
+    }
+}
+
+/**
+ * This function fills the user-specified array with pseudorandom
+ * integers.
+ *
+ * @param array an 128-bit array to be filled by pseudorandom numbers.  
+ * @param size number of 128-bit pseudorandom numbers to be generated.
+ */
+inline static void gen_rand_array(w128_t *array, int size) {
+    int i, j;
+    w128_t *r1, *r2;
+
+    r1 = &sfmt[N - 2];
+    r2 = &sfmt[N - 1];
+    for (i = 0; i < N - POS1; i++) {
+	do_recursion(&array[i], &sfmt[i], &sfmt[i + POS1], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+    }
+    for (; i < N; i++) {
+	do_recursion(&array[i], &sfmt[i], &array[i + POS1 - N], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+    }
+    for (; i < size - N; i++) {
+	do_recursion(&array[i], &array[i - N], &array[i + POS1 - N], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+    }
+    for (j = 0; j < 2 * N - size; j++) {
+	sfmt[j] = array[j + size - N];
+    }
+    for (; i < size; i++, j++) {
+	do_recursion(&array[i], &array[i - N], &array[i + POS1 - N], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+	sfmt[j] = array[i];
+    }
+}
+#endif
+
+#if defined(BIG_ENDIAN64) && !defined(ONLY64) && !defined(HAVE_ALTIVEC)
+inline static void swap(w128_t *array, int size) {
+    int i;
+    uint32_t x, y;
+
+    for (i = 0; i < size; i++) {
+	x = array[i].u[0];
+	y = array[i].u[2];
+	array[i].u[0] = array[i].u[1];
+	array[i].u[2] = array[i].u[3];
+	array[i].u[1] = x;
+	array[i].u[3] = y;
+    }
+}
+#endif
+/**
+ * Mixing helper used in the initialization by init_by_array:
+ * returns (x ^ (x >> 27)) * 1664525 as a uint32_t.
+ * @param x 32-bit integer
+ * @return 32-bit integer
+ */
+static uint32_t func1(uint32_t x) {
+    return (x ^ (x >> 27)) * (uint32_t)1664525UL;
+}
+
+/**
+ * Mixing helper used in the initialization by init_by_array:
+ * returns (x ^ (x >> 27)) * 1566083941 as a uint32_t.
+ * @param x 32-bit integer
+ * @return 32-bit integer
+ */
+static uint32_t func2(uint32_t x) {
+    return (x ^ (x >> 27)) * (uint32_t)1566083941UL;
+}
+
+/**
+ * This function certifies the period of 2^{MEXP}
+ */
+static void period_certification(void) {
+    int inner = 0;
+    int i, j;
+    uint32_t work;
+
+    for (i = 0; i < 4; i++)
+	inner ^= psfmt32[idxof(i)] & parity[i];
+    for (i = 16; i > 0; i >>= 1)
+	inner ^= inner >> i;
+    inner &= 1;
+    /* check OK */
+    if (inner == 1) {
+	return;
+    }
+    /* check NG, and modification */
+    for (i = 0; i < 4; i++) {
+	work = 1;
+	for (j = 0; j < 32; j++) {
+	    if ((work & parity[i]) != 0) {
+		psfmt32[idxof(i)] ^= work;
+		return;
+	    }
+	    work = work << 1;
+	}
+    }
+}
+
+/*----------------
+  PUBLIC FUNCTIONS
+  ----------------*/
+/**
+ * This function returns the identification string.
+ * The string shows the word size, the Mersenne exponent,
+ * and all parameters of this generator.
+ */
+const char *get_idstring(void) {
+    return IDSTR;
+}
+
+/**
+ * This function returns the minimum size of array used for \b
+ * fill_array32() function.
+ * @return minimum size of array used for fill_array32() function.
+ */
+int get_min_array_size32(void) {
+    return N32;
+}
+
+/**
+ * This function returns the minimum size of array used for \b
+ * fill_array64() function.
+ * @return minimum size of array used for fill_array64() function.
+ */
+int get_min_array_size64(void) {
+    return N64;
+}
+
+#ifndef ONLY64
+/**
+ * This function generates and returns 32-bit pseudorandom number.
+ * init_gen_rand or init_by_array must be called before this function.
+ * @return 32-bit pseudorandom number
+ */
+uint32_t gen_rand32(void) {
+    uint32_t r;
+
+    assert(initialized);
+    if (idx >= N32) {
+	gen_rand_all();
+	idx = 0;
+    }
+    r = psfmt32[idx++];
+    return r;
+}
+#endif
+/**
+ * This function generates and returns 64-bit pseudorandom number.
+ * init_gen_rand or init_by_array must be called before this function.
+ * The function gen_rand64 should not be called after gen_rand32,
+ * unless an initialization is again executed. 
+ * @return 64-bit pseudorandom number
+ */
+uint64_t gen_rand64(void) {
+#if defined(BIG_ENDIAN64) && !defined(ONLY64)
+    uint32_t r1, r2;
+#else
+    uint64_t r;
+#endif
+
+    assert(initialized);
+    assert(idx % 2 == 0);
+
+    if (idx >= N32) {
+	gen_rand_all();
+	idx = 0;
+    }
+#if defined(BIG_ENDIAN64) && !defined(ONLY64)
+    r1 = psfmt32[idx];
+    r2 = psfmt32[idx + 1];
+    idx += 2;
+    return ((uint64_t)r2 << 32) | r1;
+#else
+    r = psfmt64[idx / 2];
+    idx += 2;
+    return r;
+#endif
+}
+
+#ifndef ONLY64
+/**
+ * Fills array[] with pseudorandom 32-bit integers in a single call.
+ * The count is given by size, which must be a multiple of four and at
+ * least 624.  This is much faster than generating the numbers one at a
+ * time with the gen_rand function.
+ *
+ * init_gen_rand or init_by_array must be called before the first call
+ * of this function.  It can not be used after the gen_rand function
+ * has been called, unless the generator is initialized again.
+ *
+ * @param array destination for the pseudorandom 32-bit integers.  In
+ * the SIMD version the pointer must be \b "aligned" (a multiple of
+ * 16), because it is accessed as an array of 128-bit integers.  In the
+ * standard C version the pointer is arbitrary.
+ *
+ * @param size the number of 32-bit pseudorandom integers to generate.
+ * It must be a multiple of 4 and greater than or equal to
+ * (MEXP / 128 + 1) * 4.
+ *
+ * @note \b memalign or \b posix_memalign can be used to obtain aligned
+ * memory. Mac OSX doesn't have these functions, but \b malloc of OSX
+ * returns a pointer to an aligned memory block.
+ */
+void fill_array32(uint32_t *array, int size) {
+    w128_t *blocks = (w128_t *)array;
+
+    assert(initialized);
+    assert(idx == N32);
+    assert(size % 4 == 0);
+    assert(size >= N32);
+
+    /* four 32-bit words per 128-bit block */
+    gen_rand_array(blocks, size / 4);
+    idx = N32;
+}
+#endif
+
+/**
+ * Fills array[] with pseudorandom 64-bit integers in a single call.
+ * The count is given by size, which must be a multiple of two and at
+ * least 312.  This is much faster than generating the numbers one at a
+ * time with the gen_rand function.
+ *
+ * init_gen_rand or init_by_array must be called before the first call
+ * of this function.  It can not be used after the gen_rand function
+ * has been called, unless the generator is initialized again.
+ *
+ * @param array destination for the pseudorandom 64-bit integers.  In
+ * the SIMD version the pointer must be "aligned" (a multiple of 16),
+ * because it is accessed as an array of 128-bit integers.  In the
+ * standard C version the pointer is arbitrary.
+ *
+ * @param size the number of 64-bit pseudorandom integers to generate.
+ * It must be a multiple of 2 and greater than or equal to
+ * (MEXP / 128 + 1) * 2
+ *
+ * @note \b memalign or \b posix_memalign can be used to obtain aligned
+ * memory. Mac OSX doesn't have these functions, but \b malloc of OSX
+ * returns a pointer to an aligned memory block.
+ */
+void fill_array64(uint64_t *array, int size) {
+    w128_t *blocks = (w128_t *)array;
+    const int nblocks = size / 2;    /* two 64-bit words per block */
+
+    assert(initialized);
+    assert(idx == N32);
+    assert(size % 2 == 0);
+    assert(size >= N64);
+
+    gen_rand_array(blocks, nblocks);
+    idx = N32;
+
+#if defined(BIG_ENDIAN64) && !defined(ONLY64)
+    swap(blocks, nblocks);
+#endif
+}
+
+/**
+ * Seeds the internal state array from a single 32-bit integer.
+ * Each state word is derived from the previous one; afterwards the
+ * read position is set to N32, period_certification() is run and the
+ * generator is flagged as initialized.
+ *
+ * @param seed a 32-bit integer used as the seed.
+ */
+void init_gen_rand(uint32_t seed) {
+    int k;
+    uint32_t prev;
+
+    psfmt32[idxof(0)] = seed;
+    for (k = 1; k < N32; k++) {
+	prev = psfmt32[idxof(k - 1)];
+	psfmt32[idxof(k)] = 1812433253UL * (prev ^ (prev >> 30)) + k;
+    }
+    idx = N32;
+    period_certification();
+    initialized = 1;
+}
+
+/**
+ * This function initializes the internal state array,
+ * with an array of 32-bit integers used as the seeds
+ * @param init_key the array of 32-bit integers, used as a seed.
+ * @param key_length the length of init_key.
+ *
+ * The state is first filled with the byte 0x8b and then mixed in
+ * three passes over its 32-bit words: a func1 pass that folds in the
+ * elements of init_key, a func1 pass without key material for the
+ * iterations that remain, and a final func2 pass of N32 iterations.
+ * On return idx is N32, period_certification() has been run and the
+ * generator is flagged as initialized.
+ */
+void init_by_array(uint32_t *init_key, int key_length) {
+    int i, j, count;
+    uint32_t r;
+    int lag;
+    int mid;
+    int size = N * 4; /* presumably the state size in 32-bit words -- TODO confirm against the definition of N */
+
+    if (size >= 623) { /* lag depends only on the state size */
+	lag = 11;
+    } else if (size >= 68) {
+	lag = 7;
+    } else if (size >= 39) {
+	lag = 5;
+    } else {
+	lag = 3;
+    }
+    mid = (size - lag) / 2;
+
+    memset(sfmt, 0x8b, sizeof(sfmt)); /* fixed, non-zero starting pattern */
+    if (key_length + 1 > N32) { /* count = max(key_length + 1, N32) */
+	count = key_length + 1;
+    } else {
+	count = N32;
+    }
+    r = func1(psfmt32[idxof(0)] ^ psfmt32[idxof(mid)] 
+	      ^ psfmt32[idxof(N32 - 1)]);
+    psfmt32[idxof(mid)] += r;
+    r += key_length; /* the key length itself is mixed into word 0 */
+    psfmt32[idxof(mid + lag)] += r;
+    psfmt32[idxof(0)] = r;
+
+    count--; /* the step for word 0 above counts as the first iteration */
+    for (i = 1, j = 0; (j < count) && (j < key_length); j++) { /* pass 1: fold in init_key[j] */
+	r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % N32)] 
+		  ^ psfmt32[idxof((i + N32 - 1) % N32)]);
+	psfmt32[idxof((i + mid) % N32)] += r;
+	r += init_key[j] + i;
+	psfmt32[idxof((i + mid + lag) % N32)] += r;
+	psfmt32[idxof(i)] = r;
+	i = (i + 1) % N32;
+    }
+    for (; j < count; j++) { /* pass 2: same mixing, key exhausted */
+	r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % N32)] 
+		  ^ psfmt32[idxof((i + N32 - 1) % N32)]);
+	psfmt32[idxof((i + mid) % N32)] += r;
+	r += i;
+	psfmt32[idxof((i + mid + lag) % N32)] += r;
+	psfmt32[idxof(i)] = r;
+	i = (i + 1) % N32;
+    }
+    for (j = 0; j < N32; j++) { /* pass 3: func2 with +/^/- in place of ^/+/+ */
+	r = func2(psfmt32[idxof(i)] + psfmt32[idxof((i + mid) % N32)] 
+		  + psfmt32[idxof((i + N32 - 1) % N32)]);
+	psfmt32[idxof((i + mid) % N32)] ^= r;
+	r -= i;
+	psfmt32[idxof((i + mid + lag) % N32)] ^= r;
+	psfmt32[idxof(i)] = r;
+	i = (i + 1) % N32;
+    }
+
+    idx = N32; /* state counts as fully consumed; gen_rand32/gen_rand64 regenerate when idx >= N32 */
+    period_certification();
+    initialized = 1;
+}
diff --git a/ext/numo/narray/SFMT.h b/ext/numo/narray/SFMT.h
new file mode 100644
index 0000000..7c8b35e
--- /dev/null
+++ b/ext/numo/narray/SFMT.h
@@ -0,0 +1,157 @@
+/** 
+ * @file SFMT.h 
+ *
+ * @brief SIMD oriented Fast Mersenne Twister(SFMT) pseudorandom
+ * number generator
+ *
+ * @author Mutsuo Saito (Hiroshima University)
+ * @author Makoto Matsumoto (Hiroshima University)
+ *
+ * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University. All rights reserved.
+ *
+ * The new BSD License is applied to this software.
+ * see LICENSE.txt
+ *
+ * @note We assume that your system has inttypes.h.  If your system
+ * doesn't have inttypes.h, you have to typedef uint32_t and uint64_t,
+ * and you have to define PRIu64 and PRIx64 in this file as follows:
+ * @verbatim
+ typedef unsigned int uint32_t
+ typedef unsigned long long uint64_t  
+ #define PRIu64 "llu"
+ #define PRIx64 "llx"
+ @endverbatim
+ * uint32_t must be exactly 32-bit unsigned integer type (no more, no
+ * less), and uint64_t must be exactly 64-bit unsigned integer type.
+ * PRIu64 and PRIx64 are used for printf function to print 64-bit
+ * unsigned int and 64-bit unsigned int in hexadecimal format.
+ */
+
+#ifndef SFMT_H
+#define SFMT_H
+
+#include <stdio.h>
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+  #include <inttypes.h>
+#elif defined(_MSC_VER) || defined(__BORLANDC__)
+  typedef unsigned int uint32_t;
+  typedef unsigned __int64 uint64_t;
+  #define inline __inline
+#else
+  #include <inttypes.h>
+  #if defined(__GNUC__)
+    #define inline __inline__
+  #endif
+#endif
+
+#ifndef PRIu64
+  #if defined(_MSC_VER) || defined(__BORLANDC__)
+    #define PRIu64 "I64u"
+    #define PRIx64 "I64x"
+  #else
+    #define PRIu64 "llu"
+    #define PRIx64 "llx"
+  #endif
+#endif
+
+#if defined(__GNUC__)
+#define ALWAYSINLINE __attribute__((always_inline))
+#else
+#define ALWAYSINLINE
+#endif
+
+#if defined(_MSC_VER)
+  #if _MSC_VER >= 1200
+    #define PRE_ALWAYS __forceinline
+  #else
+    #define PRE_ALWAYS inline
+  #endif
+#else
+  #define PRE_ALWAYS inline
+#endif
+
+uint32_t gen_rand32(void);
+uint64_t gen_rand64(void);
+void fill_array32(uint32_t *array, int size);
+void fill_array64(uint64_t *array, int size);
+void init_gen_rand(uint32_t seed);
+void init_by_array(uint32_t *init_key, int key_length);
+const char *get_idstring(void);
+int get_min_array_size32(void);
+int get_min_array_size64(void);
+
+/* These real versions are due to Isaku Wada */
+/**
+ * Maps a 32-bit unsigned integer onto the closed [0,1]-real-interval
+ * by multiplying it by 1/(2^32-1).
+ * @param v 32-bit unsigned integer to convert
+ * @return v scaled to a double in [0,1]
+ */
+inline static double to_real1(uint32_t v)
+{
+    return v * (1.0/4294967295.0); 
+    /* divided by 2^32-1 */ 
+}
+
+/**
+ * Draws one value from gen_rand32() and converts it with to_real1().
+ * @return a random number on [0,1]-real-interval
+ */
+inline static double genrand_real1(void)
+{
+    const uint32_t bits = gen_rand32();
+
+    return to_real1(bits);
+}
+
+/**
+ * Maps a 32-bit unsigned integer onto the half-open [0,1)-real-interval
+ * by multiplying it by 1/2^32.
+ * @param v 32-bit unsigned integer to convert
+ * @return v scaled to a double in [0,1)
+ */
+inline static double to_real2(uint32_t v)
+{
+    return v * (1.0/4294967296.0); 
+    /* divided by 2^32 */
+}
+
+/**
+ * Draws one value from gen_rand32() and converts it with to_real2().
+ * @return a random number on [0,1)-real-interval
+ */
+inline static double genrand_real2(void)
+{
+    const uint32_t bits = gen_rand32();
+
+    return to_real2(bits);
+}
+
+/**
+ * Maps a 32-bit unsigned integer onto the open (0,1)-real-interval:
+ * the value is offset by 0.5 before being multiplied by 1/2^32, so
+ * neither end point is produced.
+ * @param v 32-bit unsigned integer to convert
+ * @return (v + 0.5) / 2^32 as a double in (0,1)
+ */
+inline static double to_real3(uint32_t v)
+{
+    return (((double)v) + 0.5)*(1.0/4294967296.0); 
+    /* divided by 2^32 */
+}
+
+/**
+ * Draws one value from gen_rand32() and converts it with to_real3().
+ * @return a random number on (0,1)-real-interval
+ */
+inline static double genrand_real3(void)
+{
+    const uint32_t bits = gen_rand32();
+
+    return to_real3(bits);
+}
+/** These real versions are due to Isaku Wada */
+
+/**
+ * Converts a 64-bit unsigned integer to a double on [0,1) with 53-bit
+ * resolution.
+ *
+ * Only the upper 53 bits of v are used, so the integer-to-double
+ * conversion is exact and the result is always strictly below 1.0.
+ * Multiplying the full 64-bit value by 2^-64 instead can round up to
+ * exactly 1.0 for v close to 2^64, and a long double constant makes
+ * the result depend on the platform's long double format.
+ *
+ * @param v 64-bit unsigned integer to convert
+ * @return (v >> 11) / 2^53, a double in [0,1)
+ */
+inline static double to_res53(uint64_t v) 
+{ 
+    return (v >> 11) * (1.0/9007199254740992.0);
+    /* upper 53 bits divided by 2^53 */
+}
+
+/**
+ * Builds a 64-bit integer from two 32-bit integers (x is the low half,
+ * y the high half) and converts it with to_res53(), giving a random
+ * number on [0,1) with 53-bit resolution.
+ */
+inline static double to_res53_mix(uint32_t x, uint32_t y) 
+{ 
+    uint64_t combined = (uint64_t)y << 32;
+
+    combined |= x;
+    return to_res53(combined);
+}
+
+/**
+ * Draws one value from gen_rand64() and converts it with to_res53().
+ * @return a random number on [0,1) with 53-bit resolution
+ */
+inline static double genrand_res53(void) 
+{ 
+    const uint64_t bits = gen_rand64();
+
+    return to_res53(bits);
+} 
+
+/**
+ * Generates a random number on [0,1) with 53-bit resolution from two
+ * consecutive gen_rand32() draws: the first becomes the low half and
+ * the second the high half passed to to_res53_mix().
+ */
+inline static double genrand_res53_mix(void) 
+{ 
+    /* two separate statements keep the draw order well defined */
+    const uint32_t low = gen_rand32();
+    const uint32_t high = gen_rand32();
+
+    return to_res53_mix(low, high);
+} 
+#endif
diff --git a/ext/numo/narray/array.c b/ext/numo/narray/array.c
new file mode 100644
index 0000000..01d7352
--- /dev/null
+++ b/ext/numo/narray/array.c
@@ -0,0 +1,642 @@
+/*
+  array.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#include <ruby.h>
+#include "numo/narray.h"
+
+// mdai: Multi-Dimensional Array Investigation -- one entry per nesting depth
+typedef struct {
+  size_t shape;  // largest length recorded so far for this depth (0 = none yet)
+  VALUE  val;    // Array being scanned at this depth, Qnil when unused
+} na_mdai_item_t;
+
+typedef struct {
+    int   capa;              // number of entries allocated in item[]
+    na_mdai_item_t *item;    // per-depth records, item[0].val is the top-level Array
+    int   type;    // Ruby numeric type - investigated separately (an NA_* rank code)
+    VALUE na_type;  // NArray type (class accumulated via na_upcast, Qnil if none seen)
+    VALUE int_max;  // NOTE(review): not referenced in the visible part of this file
+} na_mdai_t;
+
+/* Order of Ruby object: rank codes compared with '<' by na_object_type(),
+   which raises the running rank when an element needs a higher one. */
+enum { NA_NONE, NA_BIT, NA_INT32, NA_INT64, NA_RATIONAL,
+       NA_DFLOAT, NA_DCOMPLEX, NA_ROBJ, NA_NTYPES };
+
+static ID id_begin;
+static ID id_end;
+static ID id_step;
+static ID id_abs;
+static ID id_cast;
+static ID id_le;
+static ID id_Complex;
+
+
+/*
+  Merge the rank required by Ruby object `v` into the running rank `type`.
+
+  `type` and the return value are NA_* rank codes (plain ints, not Ruby
+  VALUEs).  The result is never lower than `type`, except that an object
+  of an unrecognized class always yields NA_ROBJ, the highest rank:
+    true/false      -> at least NA_BIT
+    Fixnum/Bignum   -> at least NA_INT32 if |v| <= 2147483647, else NA_INT64
+    Float           -> at least NA_DFLOAT
+    nil             -> rank unchanged
+    Complex         -> at least NA_DCOMPLEX
+    anything else   -> NA_ROBJ
+*/
+static int
+na_object_type(int type, VALUE v)
+{
+    static VALUE int32_max = Qnil;
+    if (NIL_P(int32_max)) {
+        int32_max = ULONG2NUM(2147483647);
+        /* Where long is 32-bit this value is a heap-allocated Bignum;
+           register the static so the GC does not reclaim it. */
+        rb_global_variable(&int32_max);
+    }
+
+    switch(TYPE(v)) {
+
+    case T_TRUE:
+    case T_FALSE:
+        if (type<NA_BIT)
+            return NA_BIT;
+        return type;
+
+#if SIZEOF_LONG == 4
+    case T_FIXNUM:
+        if (type<NA_INT32)
+            return NA_INT32;
+        return type;
+    case T_BIGNUM:
+        if (type<NA_INT64) {
+            v = rb_funcall(v,id_abs,0);
+            if (RTEST(rb_funcall(v,id_le,1,int32_max))) {
+                if (type<NA_INT32)
+                    return NA_INT32;
+            } else {
+                return NA_INT64;
+            }
+        }
+        return type;
+
+#elif SIZEOF_LONG == 8
+    case T_FIXNUM:
+        if (type<NA_INT64) {
+            long x = NUM2LONG(v);
+            if (x<0) x=-x;
+            if (x<=2147483647) {
+                if (type<NA_INT32)
+                    return NA_INT32;
+            } else {
+                return NA_INT64;
+            }
+        }
+        return type;
+    case T_BIGNUM:
+        if (type<NA_INT64)
+            return NA_INT64;
+        return type;
+#else
+    case T_FIXNUM:
+    case T_BIGNUM:
+        if (type<NA_INT64) {
+            v = rb_funcall(v,id_abs,0);
+            if (RTEST(rb_funcall(v,id_le,1,int32_max))) {
+                if (type<NA_INT32)
+                    return NA_INT32;
+            } else {
+                return NA_INT64;
+            }
+        }
+        return type;
+#endif
+
+    case T_FLOAT:
+        if (type<NA_DFLOAT)
+            return NA_DFLOAT;
+        return type;
+
+    case T_NIL:
+        return type;
+
+    default:
+        if (CLASS_OF(v) == rb_const_get( rb_cObject, id_Complex )) {
+            /* like the other branches, never lower an already higher rank */
+            if (type<NA_DCOMPLEX)
+                return NA_DCOMPLEX;
+            return type;
+        }
+    }
+    return NA_ROBJ;
+}
+
+
+/* Fold the rank of attribute `attr` of `v` (obtained by calling it) into `tp`. */
+#define MDAI_ATTR_TYPE(tp,v,attr)                               \
+    {tp = na_object_type(tp,rb_funcall(v,id_##attr,0));}
+
+/*
+  Rank of a scanned element: for a Range the ranks of begin and end are
+  folded in, for a Step those of begin, end and step, and for any other
+  object the rank of the object itself.
+*/
+static int na_mdai_object_type(int type, VALUE v)
+{
+    if (rb_obj_is_kind_of(v, rb_cRange)) {
+        type = na_object_type(type, rb_funcall(v, id_begin, 0));
+        type = na_object_type(type, rb_funcall(v, id_end, 0));
+        return type;
+    }
+    if (rb_obj_is_kind_of(v, na_cStep)) {
+        type = na_object_type(type, rb_funcall(v, id_begin, 0));
+        type = na_object_type(type, rb_funcall(v, id_end, 0));
+        type = na_object_type(type, rb_funcall(v, id_step, 0));
+        return type;
+    }
+    return na_object_type(type, v);
+}
+
+
+/* Allocate an investigation record with room for 4 depths, rooted at `ary`. */
+static na_mdai_t *
+na_mdai_alloc(VALUE ary)
+{
+    const int initial_capa = 4;
+    na_mdai_t *m = ALLOC(na_mdai_t);
+    int k;
+
+    m->capa = initial_capa;
+    m->item = ALLOC_N( na_mdai_item_t, initial_capa );
+    for (k = initial_capa; k-- > 0; ) {
+        m->item[k].shape = 0;
+        m->item[k].val = Qnil;
+    }
+    /* depth 0 is the array under investigation */
+    m->item[0].val = ary;
+    m->type = NA_NONE;
+    m->na_type = Qnil;
+
+    return m;
+}
+
+/* Grow item[] by `n_extra` entries and reset the newly added ones. */
+static void
+na_mdai_realloc(na_mdai_t *mdai, int n_extra)
+{
+    const int old_capa = mdai->capa;
+    const int new_capa = old_capa + n_extra;
+    int k;
+
+    mdai->capa = new_capa;
+    REALLOC_N( mdai->item, na_mdai_item_t, new_capa );
+    for (k = old_capa; k < new_capa; k++) {
+        mdai->item[k].shape = 0;
+        mdai->item[k].val = Qnil;
+    }
+}
+
+/* Release an investigation record: first its item array, then the record. */
+static void
+na_mdai_free(void *ptr)
+{
+    na_mdai_t *const record = ptr;
+
+    xfree(record->item);
+    xfree(record);
+}
+
+
+/* investigate ndim, shape, type of Array
+ *
+ * Scans the Ruby Array stored in mdai->item[ndim-1].val:
+ *   - a nested Array is recorded at depth ndim and scanned recursively;
+ *     an Array already on the current path raises StandardError
+ *   - a Range or Step adds (its sequence length - 1) to this depth's
+ *     length and contributes the rank of its attributes to mdai->type
+ *   - an NArray raises the recorded shape of the depths below by its own
+ *     shape and is merged into mdai->na_type
+ *   - any other object contributes its rank to mdai->type
+ * item[ndim-1].shape is raised to the resulting length if it is larger.
+ * Returns 1 when the resulting length is 0 (array counts as empty),
+ * otherwise 0.
+ */
+static int
+na_mdai_investigate(na_mdai_t *mdai, int ndim)
+{
+    ssize_t i;
+    int j;
+    size_t len, length;
+    double dbeg, dstep;
+    VALUE  v;
+    VALUE  val;
+
+    val = mdai->item[ndim-1].val;
+    len = RARRAY_LEN(val); /* adjusted below for empty children and sequences */
+
+    for (i=0; i < RARRAY_LEN(val); i++) {
+        v = RARRAY_AREF(val,i);
+
+        if (TYPE(v) == T_ARRAY) {
+            /* check recursive array */
+            for (j=0; j<ndim; j++) {
+                if (mdai->item[j].val == v)
+                    rb_raise(rb_eStandardError,
+                             "cannot convert from a recursive Array to NArray");
+            }
+            if ( ndim >= mdai->capa ) { /* item[ndim] must exist before it is written */
+                na_mdai_realloc(mdai,4);
+            }
+            mdai->item[ndim].val = v;
+            if ( na_mdai_investigate(mdai,ndim+1) ) {
+                len--; /* Array is empty */
+            }
+        }
+        else
+        if (rb_obj_is_kind_of(v, rb_cRange) || rb_obj_is_kind_of(v, na_cStep)) {
+            nary_step_sequence(v,&length,&dbeg,&dstep); /* only length is used here */
+            len += length-1; /* the sequence stands for `length` elements, not one */
+            mdai->type = na_mdai_object_type(mdai->type, v);
+        }
+        else if (IsNArray(v)) {
+            int r;
+            narray_t *na;
+            GetNArray(v,na);
+            if ( na->ndim == 0 ) {
+                len--; /* NArray is empty */
+            } else {
+                if ( ndim+na->ndim > mdai->capa ) { /* grow in multiples of 4 */
+                    na_mdai_realloc(mdai,((na->ndim-1)/4+1)*4);
+                }
+                for ( j=0,r=ndim; j < na->ndim  ; j++,r++ ) { /* its axes map to depths ndim.. */
+                    if ( mdai->item[r].shape < na->shape[j] )
+                        mdai->item[r].shape = na->shape[j];
+                }
+            }
+            // type
+            if (NIL_P(mdai->na_type)) {
+                mdai->na_type = CLASS_OF(v);
+            } else {
+                mdai->na_type = na_upcast(CLASS_OF(v), mdai->na_type);
+            }
+        } else {
+            mdai->type = na_mdai_object_type(mdai->type, v);
+        }
+    }
+
+    if (len==0) return 1; /* this array is empty */
+    if (mdai->item[ndim-1].shape < len) { /* keep the maximum over sibling arrays */
+        mdai->item[ndim-1].shape = len;
+    }
+    return 0;
+}
+
+
+/* Number of leading depths that have a non-zero recorded shape. */
+static inline int
+na_mdai_ndim(na_mdai_t *mdai)
+{
+    int depth = 0;
+
+    while (depth < mdai->capa && mdai->item[depth].shape > 0) {
+        depth++;
+    }
+    return depth;
+}
+
+/* Copy the recorded shape of the first `ndim` depths into shape[]. */
+static inline void
+na_mdai_shape(na_mdai_t *mdai, int ndim, size_t *shape)
+{
+    int k;
+
+    for (k = ndim; k-- > 0; ) {
+        shape[k] = mdai->item[k].shape;
+    }
+}
+
+/* Map an NA_* rank code to its NArray class; Qnil for codes without one. */
+static VALUE
+na_mdai_dtype_numeric(int type)
+{
+    switch(type) {
+    case NA_BIT:      return numo_cBit;
+    case NA_INT32:    return numo_cInt32;
+    case NA_INT64:    return numo_cInt64;
+    case NA_DFLOAT:   return numo_cDFloat;
+    case NA_DCOMPLEX: return numo_cDComplex;
+    case NA_ROBJ:     return numo_cRObject;
+    default:          return Qnil;
+    }
+}
+
+/*
+  Resulting data type of an investigation: the class for the numeric rank,
+  the accumulated NArray class, or the upcast of both when both exist.
+*/
+static VALUE
+na_mdai_dtype(na_mdai_t *mdai)
+{
+    VALUE numeric = na_mdai_dtype_numeric(mdai->type);
+
+    if (NIL_P(mdai->na_type)) {
+        return numeric;
+    }
+    if (NIL_P(numeric)) {
+        return mdai->na_type;
+    }
+    return na_upcast(mdai->na_type, numeric);
+}
+
+
+/*
+  Reconcile a requested type with an inferred one.  Without ptype the
+  inferred dtype is returned.  If *ptype is cNArray or not truthy it is
+  overwritten with dtype; otherwise *ptype is kept and returned.
+*/
+static inline VALUE
+update_type(VALUE *ptype, VALUE dtype)
+{
+    if (!ptype) {
+        return dtype;
+    }
+    if (*ptype != cNArray && RTEST(*ptype)) {
+        /* caller asked for a concrete type: it takes precedence */
+        return *ptype;
+    }
+    *ptype = dtype;
+    return dtype;
+}
+
+/* Raise nary_eCastError unless dtype is a Class for which `dtype <= cNArray` holds. */
+static inline void
+check_subclass_of_narray(VALUE dtype)
+{
+    /* short-circuit: `<=` is only sent to objects that are classes */
+    if (!RTEST(rb_obj_is_kind_of(dtype, rb_cClass)) ||
+        !RTEST(rb_funcall(dtype, id_le, 1, cNArray))) {
+        rb_raise(nary_eCastError, "cannot convert to NArray");
+    }
+}
+
+
+/* Size in bytes of an investigation record including its item array. */
+static size_t
+na_mdai_memsize(const void *ptr)
+{
+    const na_mdai_t *record = ptr;
+    const size_t items_size = record->capa * sizeof(na_mdai_item_t);
+
+    return sizeof(na_mdai_t) + items_size;
+}
+
+static const rb_data_type_t mdai_data_type = { /* typed-data descriptor used to wrap na_mdai_t */
+    "Numo::NArray/mdai",
+    {NULL, na_mdai_free, na_mdai_memsize,}, /* no mark function; free and memsize callbacks */
+    0, 0, RUBY_TYPED_FREE_IMMEDIATELY|RUBY_TYPED_WB_PROTECTED
+};
+
+/*
+  Investigate the (possibly nested) Ruby Array `ary` and report the
+  result through the output pointers; each pointer may be 0 to skip it.
+    ptype  - in/out: reconciled with the inferred type by update_type()
+    pshape - out: Ruby Array holding the size of each dimension
+    pnary  - out: NArray created by nary_new() with the resulting type
+             and shape (the type must be a subclass of NArray)
+  An empty array is reported as type numo_cInt32 (unless *ptype names
+  another type) with shape [0].
+*/
+static void
+na_composition3_ary(VALUE ary, VALUE *ptype, VALUE *pshape, VALUE *pnary)
+{
+    VALUE vmdai;
+    na_mdai_t *mdai;
+    int i, ndim;
+    size_t *shape;
+    VALUE dtype, dshape;
+
+    mdai = na_mdai_alloc(ary);
+    vmdai = TypedData_Wrap_Struct(rb_cData, &mdai_data_type, (void*)mdai); /* wrapped with na_mdai_free as its free callback */
+    if ( na_mdai_investigate(mdai, 1) ) {
+        // empty
+        dtype = update_type(ptype, numo_cInt32);
+        if (pshape) {
+            *pshape = rb_ary_new3(1, INT2FIX(0));
+        }
+        if (pnary) {
+            check_subclass_of_narray(dtype);
+            shape = ALLOCA_N(size_t, 1);
+            shape[0] = 0;
+            *pnary = nary_new(dtype, 1, shape);
+        }
+    } else {
+        ndim = na_mdai_ndim(mdai);
+        shape = ALLOCA_N(size_t, ndim);
+        na_mdai_shape(mdai, ndim, shape);
+        dtype = update_type(ptype, na_mdai_dtype(mdai));
+        if (pshape) {
+            dshape = rb_ary_new2(ndim);
+            for (i=0; i<ndim; i++) {
+                rb_ary_push(dshape, SIZET2NUM(shape[i]));
+            }
+            *pshape = dshape;
+        }
+        if (pnary) {
+            check_subclass_of_narray(dtype);
+            *pnary = nary_new(dtype, ndim, shape);
+        }
+    }
+    RB_GC_GUARD(vmdai); /* keep the wrapper (and thus mdai) alive up to this point */
+}
+
+
+/*
+  Determine data type, shape and (optionally) a new NArray for `obj`.
+  Each of ptype, pshape and pnary may be 0 to skip that output.
+    Array   - delegated to na_composition3_ary()
+    Numeric - 0-dimensional: shape is an empty Array
+    NArray  - class and shape are taken from obj itself
+  Any other object raises TypeError.  This path is reachable with
+  arbitrary user-supplied objects (e.g. through NArray.array_type), so
+  it must raise a catchable exception rather than abort the interpreter
+  with rb_bug().
+*/
+static void
+na_composition3(VALUE obj, VALUE *ptype, VALUE *pshape, VALUE *pnary)
+{
+    VALUE dtype, dshape;
+
+    if (TYPE(obj) == T_ARRAY) {
+        na_composition3_ary(obj, ptype, pshape, pnary);
+    }
+    else if (RTEST(rb_obj_is_kind_of(obj,rb_cNumeric))) {
+        dtype = na_mdai_dtype_numeric(na_mdai_object_type(NA_NONE, obj));
+        dtype = update_type(ptype, dtype);
+        if (pshape) {
+            *pshape = rb_ary_new();
+        }
+        if (pnary) {
+            check_subclass_of_narray(dtype);
+            *pnary = nary_new(dtype, 0, 0);
+        }
+    }
+    else if (IsNArray(obj)) {
+        int i, ndim;
+        narray_t *na;
+        GetNArray(obj,na);
+        ndim = na->ndim;
+        dtype = update_type(ptype, CLASS_OF(obj));
+        if (pshape) {
+            dshape = rb_ary_new2(ndim);
+            for (i=0; i<ndim; i++) {
+                rb_ary_push(dshape, SIZET2NUM(na->shape[i]));
+            }
+            *pshape = dshape;
+        }
+        if (pnary) {
+            *pnary = nary_new(dtype, ndim, na->shape);
+        }
+    } else {
+        rb_raise(rb_eTypeError, "invalid type for NArray: %s",
+                 rb_class2name(CLASS_OF(obj)));
+    }
+}
+
+
+/* Shape of a (nested) Ruby Array as a Ruby Array; [] for a non-Array argument. */
+static VALUE
+na_s_array_shape(VALUE mod, VALUE ary)
+{
+    VALUE shape;
+
+    if (TYPE(ary) == T_ARRAY) {
+        na_composition3(ary, 0, &shape, 0);
+        return shape;
+    }
+    // 0-dimension
+    return rb_ary_new();
+}
+
+
+/*
+  Create a new, unallocated NArray whose shape (and, where applicable,
+  type) is derived from obj.
+  Called on Numo::NArray the element type is chosen from obj;
+  called on a typed class such as Numo::DFloat that class is used.
+
+  @overload new_like(obj)
+  @param [Numeric,Array,Numo::NArray] obj
+  @return [Numo::NArray]
+  @example
+    Numo::NArray.new_like([[1,2,3],[4,5,6]])
+    => Numo::Int32#shape=[2,3](empty)
+    Numo::DFloat.new_like([[1,2],[3,4]])
+    => Numo::DFloat#shape=[2,2](empty)
+    Numo::NArray.new_like([1,2i,3])
+    => Numo::DComplex#shape=[3](empty)
+*/
+VALUE
+na_s_new_like(VALUE type, VALUE obj)
+{
+    VALUE result;
+
+    /* `type` is both the requested class and, on return, the resolved one */
+    na_composition3(obj, &type, 0, &result);
+    return result;
+}
+
+
+/* Data type that na_composition3() infers for `ary` when no type is requested. */
+VALUE
+na_ary_composition_dtype(VALUE ary)
+{
+    VALUE inferred = Qnil;
+
+    na_composition3(ary, &inferred, 0, 0);
+    return inferred;
+}
+
+/* Singleton-method wrapper around na_ary_composition_dtype(); `mod` is unused. */
+static VALUE
+na_s_array_type(VALUE mod, VALUE ary)
+{
+    VALUE dtype = na_ary_composition_dtype(ary);
+
+    return dtype;
+}
+
+
+/*
+  Build an NArray from the given elements; the NArray data type is
+  selected automatically from them.
+  @overload [](elements)
+  @param [Numeric,Array] elements
+  @return [NArray]
+*/
+static VALUE
+nary_s_bracket(VALUE klass, VALUE ary)
+{
+    VALUE dtype;
+
+    if (TYPE(ary) != T_ARRAY) {
+        rb_bug("Argument is not array");
+    }
+    /* infer the type, verify it, then let that class do the cast */
+    dtype = na_ary_composition_dtype(ary);
+    check_subclass_of_narray(dtype);
+    return rb_funcall(dtype, id_cast, 1, ary);
+}
+
+
+//VALUE
+//nst_check_compatibility(VALUE self, VALUE ary);
+
+
+/* investigate ndim, shape, type of Array */
+/*
+static int
+na_mdai_for_struct(na_mdai_t *mdai, int ndim)
+{
+    size_t i;
+    int j, r;
+    size_t len;
+    VALUE  v;
+    VALUE  val;
+    narray_t *na;
+
+    //fprintf(stderr,"ndim=%d\n",ndim);    rb_p(mdai->na_type);
+    if (ndim>4) { abort(); }
+    val = mdai->item[ndim].val;
+
+    //fpintf(stderr,"val = ");    rb_p(val);
+
+    if (CLASS_OF(val) == mdai->na_type) {
+        GetNArray(val,na);
+        if ( ndim+na->ndim > mdai->capa ) {
+            abort();
+            na_mdai_realloc(mdai,((na->ndim-1)/4+1)*4);
+        }
+        for ( j=0,r=ndim; j < na->ndim; j++,r++ ) {
+            if ( mdai->item[r].shape < na->shape[j] )
+                mdai->item[r].shape = na->shape[j];
+        }
+        return 1;
+    }
+
+    if (TYPE(val) == T_ARRAY) {
+        // check recursive array
+        for (j=0; j<ndim-1; j++) {
+            if (mdai->item[j].val == val)
+                rb_raise(rb_eStandardError,
+                         "cannot convert from a recursive Array to NArray");
+        }
+        //fprintf(stderr,"check:");        rb_p(val);
+        // val is a Struct recort
+        if (RTEST( nst_check_compatibility(mdai->na_type, val) )) {
+            //fputs("compati\n",stderr);
+            return 1;
+        }
+        // otherwise, multi-dimention
+        if (ndim >= mdai->capa) {
+            //fprintf(stderr,"exeed capa\n");            abort();
+            na_mdai_realloc(mdai,4);
+        }
+        // finally, multidimension-check
+        len = RARRAY_LEN(val);
+        for (i=0; i < len; i++) {
+            v = RARRAY_AREF(val,i);
+            if (TYPE(v) != T_ARRAY) {
+                //abort();
+                return 0;
+            }
+        }
+        for (i=0; i < len; i++) {
+            v = RARRAY_AREF(val,i);
+            //fprintf(stderr,"check:");            rb_p(v);
+            mdai->item[ndim+1].val = v;
+            if ( na_mdai_for_struct( mdai, ndim+1 ) == 0 ) {
+                //fprintf(stderr,"not struct:");                rb_p(v);
+                //abort();
+                return 0;
+            }
+        }
+        if (mdai->item[ndim].shape < len) {
+            mdai->item[ndim].shape = len;
+        }
+        return 1;
+    }
+
+    //fprintf(stderr,"invalid for struct:");    rb_p(val);    abort();
+    return 0;
+}
+*/
+
+
+/*
+VALUE
+na_ary_composition_for_struct(VALUE nstruct, VALUE ary)
+{
+    volatile VALUE vmdai, vnc;
+    na_mdai_t *mdai;
+    na_compose_t *nc;
+
+    mdai = na_mdai_alloc(ary);
+    mdai->na_type = nstruct;
+    vmdai = TypedData_Wrap_Struct(rb_cData, &mdai_data_type, (void*)mdai);
+    na_mdai_for_struct(mdai, 0);
+    nc = na_compose_alloc();
+    vnc = WrapCompose(nc);
+    na_mdai_result(mdai, nc);
+    //fprintf(stderr,"nc->ndim=%d\n",nc->ndim);
+    rb_gc_force_recycle(vmdai);
+    return vnc;
+}
+*/
+
+
+
+/* Intern the method/constant names used in this file and register the
+   NArray singleton methods implemented here. */
+void
+Init_nary_array()
+{
+    id_begin   = rb_intern("begin");
+    id_end     = rb_intern("end");
+    id_step    = rb_intern("step");
+    id_cast    = rb_intern("cast");
+    id_abs     = rb_intern("abs");
+    id_le      = rb_intern("<=");
+    id_Complex = rb_intern("Complex");
+
+    rb_define_singleton_method(cNArray, "array_shape", na_s_array_shape, 1);
+    rb_define_singleton_method(cNArray, "array_type", na_s_array_type, 1);
+    rb_define_singleton_method(cNArray, "new_like", na_s_new_like, 1);
+    rb_define_singleton_method(cNArray, "[]", nary_s_bracket, -2);
+}
diff --git a/ext/numo/narray/data.c b/ext/numo/narray/data.c
new file mode 100644
index 0000000..7a70a6a
--- /dev/null
+++ b/ext/numo/narray/data.c
@@ -0,0 +1,966 @@
+/*
+  data.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+
+#include <ruby.h>
+#include "numo/narray.h"
+#include "numo/template.h"
+
+static VALUE sym_mulsum;
+static ID id_mulsum;
+static ID id_respond_to_p;
+static ID id_store;
+static ID id_swap_byte;
+
+// ---------------------------------------------------------------------
+
+/*
+  Run proc(ptr1, ptr2) once per element of the loop `lp`.
+  ptr1 addresses argument 0 and ptr2 argument 1, as set up by
+  INIT_PTR_IDX: when an index array is present (idx != NULL) the element
+  is at base + *idx, otherwise the base pointer advances by the stride.
+  All four index/stride combinations are handled; in every one of them
+  ptr2 must be derived from p2 (argument 1), never from p1.
+*/
+#define LOOP_UNARY_PTR(lp,proc)                    \
+{                                                  \
+    size_t  i;                                     \
+    ssize_t s1, s2;                                \
+    char   *p1, *p2;                               \
+    size_t *idx1, *idx2;                           \
+    INIT_COUNTER(lp, i);                           \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);             \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);             \
+    if (idx1) {                                    \
+        if (idx2) {                                \
+            for (; i--;) {                         \
+                proc((p1+*idx1), (p2+*idx2));      \
+                idx1++;                            \
+                idx2++;                            \
+            }                                      \
+        } else {                                   \
+            for (; i--;) {                         \
+                proc((p1+*idx1), p2);              \
+                idx1++;                            \
+                p2 += s2;                          \
+            }                                      \
+        }                                          \
+    } else {                                       \
+        if (idx2) {                                \
+            for (; i--;) {                         \
+                proc(p1, (p2+*idx2));              \
+                p1 += s1;                          \
+                idx2++;                            \
+            }                                      \
+        } else {                                   \
+            for (; i--;) {                         \
+                proc(p1, p2);                      \
+                p1 += s1;                          \
+                p2 += s2;                          \
+            }                                      \
+        }                                          \
+    }                                              \
+}
+
+/* Copy one element of `e` bytes; `e` comes from the scope of the caller. */
+#define m_memcpy(src,dst) memcpy(dst,src,e)
+/* Loop body that copies every element of argument 0 to argument 1 bytewise. */
+void
+iter_copy_bytes(na_loop_t *const lp)
+{
+    size_t e = lp->args[0].elmsz;
+
+    LOOP_UNARY_PTR(lp,m_memcpy);
+}
+
+/* Return the result of running the bytewise copy loop over `self`. */
+VALUE
+na_copy(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{Qnil,0}};
+    ndfunc_arg_out_t aout[1] = {{INT2FIX(0),0}};
+    ndfunc_t ndf = { iter_copy_bytes, FULL_LOOP, 1, 1, ain, aout };
+
+    return na_ndloop(&ndf, 1, self);
+}
+
+/* Send `store(src)` to self and return what the method returns. */
+VALUE
+na_store(VALUE self, VALUE src)
+{
+    VALUE result = rb_funcall(self,id_store,1,src);
+
+    return result;
+}
+
+// ---------------------------------------------------------------------
+
+/* Copy the `e` bytes at q1 to q2 in reverse order, going through the
+   scratch buffers b1 and b2 that the caller's scope provides. */
+#define m_swap_byte(q1,q2)       \
+    {                            \
+        size_t j;                \
+        memcpy(b1,q1,e);         \
+        for (j=0; j<e; j++) {    \
+            b2[e-1-j] = b1[j];   \
+        }                        \
+        memcpy(q2,b2,e);         \
+    }
+
+/* Loop body that byte-reverses every element of argument 0 into argument 1. */
+static void
+iter_swap_byte(na_loop_t *const lp)
+{
+    size_t  e  = lp->args[0].elmsz;
+    char   *b1 = ALLOCA_N(char, e);
+    char   *b2 = ALLOCA_N(char, e);
+
+    LOOP_UNARY_PTR(lp,m_swap_byte);
+}
+
+/* Run the byte-swap loop over self, carry self's flags over to the
+   result when it is a different object, and toggle its endian flag. */
+static VALUE
+nary_swap_byte(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{Qnil,0}};
+    ndfunc_arg_out_t aout[1] = {{INT2FIX(0),0}};
+    ndfunc_t ndf = { iter_swap_byte, FULL_LOOP|NDF_ACCEPT_BYTESWAP,
+                     1, 1, ain, aout };
+    VALUE swapped = na_ndloop(&ndf, 1, self);
+
+    if (swapped != self) {
+        na_copy_flags(self, swapped);
+    }
+    REVERSE_ENDIAN(swapped);
+    return swapped;
+}
+
+
+/* Return self if it is flagged big-endian, otherwise the result of swap_byte. */
+static VALUE
+nary_to_network(VALUE self)
+{
+    if (!TEST_BIG_ENDIAN(self)) {
+        return rb_funcall(self, id_swap_byte, 0);
+    }
+    return self;
+}
+
+/* Return self if it is flagged little-endian, otherwise the result of swap_byte. */
+static VALUE
+nary_to_vacs(VALUE self)
+{
+    if (!TEST_LITTLE_ENDIAN(self)) {
+        return rb_funcall(self, id_swap_byte, 0);
+    }
+    return self;
+}
+
+/* Return self if it is flagged as host byte order, otherwise the result of swap_byte. */
+static VALUE
+nary_to_host(VALUE self)
+{
+    if (!TEST_HOST_ORDER(self)) {
+        return rb_funcall(self, id_swap_byte, 0);
+    }
+    return self;
+}
+
+/* Return self if it is flagged as byte-swapped, otherwise the result of swap_byte. */
+static VALUE
+nary_to_swapped(VALUE self)
+{
+    if (!TEST_BYTE_SWAPPED(self)) {
+        return rb_funcall(self, id_swap_byte, 0);
+    }
+    return self;
+}
+
+
+//----------------------------------------------------------------------
+
+/* Validate `axis` against `ndim` (accepted range: -ndim .. ndim-1) and
+   return it as a non-negative index; raises nary_eDimensionError otherwise. */
+static inline int
+check_axis(int axis, int ndim)
+{
+    const int in_range = (axis >= -ndim) && (axis < ndim);
+
+    if (!in_range) {
+        rb_raise(nary_eDimensionError,"invalid axis (%d for %d-dimension)",
+                 axis, ndim);
+    }
+    /* negative axes count from the end */
+    return (axis < 0) ? axis + ndim : axis;
+}
+
+/*
+  Exchange two axes of the array.
+  @overload  swapaxes(axis1,axis2)
+  @param [Integer] axis1
+  @param [Integer] axis2
+  @return [Numo::NArray]  view of NArray.
+  @example
+    x = Numo::Int32[[1,2,3]]
+
+    p x.swapaxes(0,1)
+    # Numo::Int32(view)#shape=[3,1]
+    # [[1],
+    #  [2],
+    #  [3]]
+
+    p x = Numo::Int32[[[0,1],[2,3]],[[4,5],[6,7]]]
+    # Numo::Int32#shape=[2,2,2]
+    # [[[0, 1],
+    #   [2, 3]],
+    #  [[4, 5],
+    #   [6, 7]]]
+
+    p x.swapaxes(0,2)
+    # Numo::Int32(view)#shape=[2,2,2]
+    # [[[0, 4],
+    #   [2, 6]],
+    #  [[1, 5],
+    #   [3, 7]]]
+*/
+VALUE
+na_swapaxes(VALUE self, VALUE a1, VALUE a2)
+{
+    volatile VALUE view = na_make_view(self);
+    narray_view_t *nv;
+    int  ax1, ax2;
+    size_t saved_shape;
+    stridx_t saved_stridx;
+
+    GetNArrayView(view,nv);
+
+    ax1 = check_axis(NUM2INT(a1), nv->base.ndim);
+    ax2 = check_axis(NUM2INT(a2), nv->base.ndim);
+
+    /* swap the shape entry and the stride/index entry of both axes */
+    saved_shape  = nv->base.shape[ax1];
+    saved_stridx = nv->stridx[ax1];
+
+    nv->base.shape[ax1] = nv->base.shape[ax2];
+    nv->stridx[ax1]     = nv->stridx[ax2];
+
+    nv->base.shape[ax2] = saved_shape;
+    nv->stridx[ax2]     = saved_stridx;
+
+    return view;
+}
+
+/* Return a view of self in which axis k takes the shape and stride/index
+   entry of the original axis map[k]. */
+VALUE
+na_transpose_map(VALUE self, int *map)
+{
+    volatile VALUE view = na_make_view(self);
+    narray_view_t *nv;
+    size_t *old_shape;
+    stridx_t *old_stridx;
+    int  k, n;
+
+    GetNArrayView(view,nv);
+
+    n = nv->base.ndim;
+    old_shape = ALLOCA_N(size_t,n);
+    old_stridx = ALLOCA_N(stridx_t,n);
+
+    /* snapshot first: the permutation below overwrites in place */
+    for (k = n; k-- > 0; ) {
+        old_shape[k] = nv->base.shape[k];
+        old_stridx[k] = nv->stridx[k];
+    }
+    for (k = n; k-- > 0; ) {
+        nv->base.shape[k] = old_shape[map[k]];
+        nv->stridx[k] = old_stridx[map[k]];
+    }
+    return view;
+}
+
+
+#define SWAP(a,b,tmp) {tmp=a;a=b;b=tmp;}
+/*
+  Return a view of self with its axes permuted.
+    - fewer than 2 dimensions: no argument is accepted; a plain view
+      is returned
+    - no argument: the order of the axes is reversed
+    - argc (<= ndim) Fixnum arguments, either all non-negative or all
+      negative and without repetition:
+        non-negative values must be below argc and re-map the first
+        argc axes; negative values must be at least -argc and re-map
+        the last argc axes.  The remaining axes keep their position.
+  Raises ArgumentError for anything else.
+*/
+VALUE
+na_transpose(int argc, VALUE *argv, VALUE self)
+{
+    int ndim, *map, *permute;
+    int i, d;
+    bool is_positive, is_negative;
+    narray_t *na1;
+
+    GetNArray(self,na1);
+    ndim = na1->ndim;
+    if (ndim < 2) {
+        if (argc > 0) {
+            rb_raise(rb_eArgError, "unnecessary argument for 1-d array");
+        }
+        return na_make_view(self);
+    }
+    map = ALLOCA_N(int,ndim);
+    if (argc == 0) {
+        for (i=0; i < ndim; i++) {
+            map[i] = ndim-1-i;
+        }
+        return na_transpose_map(self,map);
+    }
+    // with argument
+    if (argc > ndim) {
+        rb_raise(rb_eArgError, "more arguments than ndim");
+    }
+    for (i=0; i < ndim; i++) { /* start from the identity mapping */
+        map[i] = i;
+    }
+    permute = ALLOCA_N(int,argc); /* permute[k] != 0: slot k already used */
+    for (i=0; i < argc; i++) {
+        permute[i] = 0;
+    }
+    is_positive = is_negative = 0;
+    for (i=0; i < argc; i++) {
+	if (TYPE(argv[i]) != T_FIXNUM) {
+            rb_raise(rb_eArgError, "invalid argument");
+        }
+        d = FIX2INT(argv[i]);
+        if (d >= 0) {
+            if (d >= argc) {
+                rb_raise(rb_eArgError, "out of dimension range");
+            }
+            if (is_negative) {
+                rb_raise(rb_eArgError, "dimension must be non-negative only or negative only");
+            }
+            if (permute[d]) {
+                rb_raise(rb_eArgError, "not permutation");
+            }
+            map[i] = d; /* non-negative: permutes the leading argc axes */
+            permute[d] = 1;
+            is_positive = 1;
+        } else {
+            if (d < -argc) {
+                rb_raise(rb_eArgError, "out of dimension range");
+            }
+            if (is_positive) {
+                rb_raise(rb_eArgError, "dimension must be non-negative only or negative only");
+            }
+            if (permute[argc+d]) {
+                rb_raise(rb_eArgError, "not permutation");
+            }
+            map[ndim-argc+i] = ndim+d; /* negative: permutes the trailing argc axes */
+            permute[argc+d] = 1;
+            is_negative = 1;
+        }
+    }
+    return na_transpose_map(self,map);
+}
+
+//----------------------------------------------------------------------
+
+/*
+  Validate the arguments of reshape/reshape! and fill shape[0..argc-1].
+
+  Each argument is either a positive Fixnum (the size of that dimension)
+  or nil/true, which marks the single dimension whose size is derived
+  from the total number of elements of self.
+
+  Raises ArgumentError when no argument is given, when a size is not
+  positive, when more than one dimension is left unfixed, when an
+  argument has another type, or when the sizes do not match the number
+  of elements. Raises RuntimeError when self is empty.
+*/
+static void
+na_check_reshape(int argc, VALUE *argv, VALUE self, size_t *shape)
+{
+    int    i, len, unfixed=-1;
+    int    too_large=0;
+    size_t total=1;
+    narray_t *na;
+
+    if (argc == 0) {
+        rb_raise(rb_eArgError, "No argument");
+    }
+    GetNArray(self,na);
+    if (NA_SIZE(na) == 0) {
+        rb_raise(rb_eRuntimeError, "cannot reshape empty array");
+    }
+
+    /* get shape from argument */
+    for (i=0; i<argc; ++i) {
+        switch(TYPE(argv[i])) {
+        case T_FIXNUM:
+            len = NUM2INT(argv[i]);
+            // A negative int stored into size_t becomes a huge value whose
+            // wrapped product may match the element count by accident, and
+            // a zero would later be used as a divisor.
+            if (len <= 0) {
+                rb_raise(rb_eArgError, "size must be positive: %d", len);
+            }
+            shape[i] = (size_t)len;
+            // Keep total <= NA_SIZE(na): this prevents size_t overflow and
+            // guarantees total is never zero in the modulo/division below.
+            if (shape[i] > NA_SIZE(na) / total) {
+                too_large = 1;
+            } else {
+                total *= shape[i];
+            }
+            break;
+        case T_NIL:
+        case T_TRUE:
+            if (unfixed >= 0) {
+                rb_raise(rb_eArgError,"multiple unfixed dimension");
+            }
+            unfixed = i;
+            break;
+        default:
+            rb_raise(rb_eArgError,"illegal type");
+        }
+    }
+
+    if (unfixed>=0) {
+        if (too_large || NA_SIZE(na) % total != 0) {
+            rb_raise(rb_eArgError, "Total size must be divisible by the specified sizes");
+        }
+        shape[unfixed] = NA_SIZE(na) / total;
+    }
+    else if (too_large || total !=  NA_SIZE(na)) {
+        rb_raise(rb_eArgError, "Total size must be same");
+    }
+}
+
+/*
+  Change the shape of self NArray without copying.
+  Raise exception if self is non-contiguous.
+
+  @overload  reshape!(size0,size1,...)
+  @param sizeN [Integer,nil,true] new size of each dimension. At most one
+    size may be nil or true; that size is computed from the total size.
+  @return [Numo::NArray] return self.
+  @raise [StandardError] if self is non-contiguous.
+  @raise [ArgumentError] if the sizes are missing, of an illegal type,
+    or do not match the total size of self.
+  @raise [RuntimeError] if self is empty.
+  @example
+    a = Numo::DFloat.new(4,5).seq
+    a.reshape!(5,4)
+    a.reshape!(nil,10)   # first size is computed from the total size
+*/
+static VALUE
+na_reshape_bang(int argc, VALUE *argv, VALUE self)
+{
+    size_t *shape;
+    narray_t *na;
+
+    // contiguity is checked first, before the arguments are examined
+    if (na_check_contiguous(self)==Qfalse) {
+        rb_raise(rb_eStandardError, "cannot change shape of non-contiguous NArray");
+    }
+    shape = ALLOCA_N(size_t, argc);
+    // validates argv and fills shape[]; raises on any mismatch
+    na_check_reshape(argc, argv, self, shape);
+
+    GetNArray(self, na);
+    na_setup_shape(na, argc, shape);
+    return self;
+}
+
+/*
+  Copy and change the shape of NArray.
+  Returns a copied NArray; self keeps its shape.
+
+  @overload  reshape(size0,size1,...)
+  @param sizeN [Integer,nil,true] new size of each dimension. At most one
+    size may be nil or true; that size is computed from the total size.
+  @return [Numo::NArray] the reshaped copy (the result of calling dup on self).
+  @raise [ArgumentError] if the sizes are missing, of an illegal type,
+    or do not match the total size of self.
+  @raise [RuntimeError] if self is empty.
+  @example
+    a = Numo::DFloat.new(4,5).seq
+    b = a.reshape(2,10)
+    c = a.reshape(nil,4)   # first size is computed from the total size
+*/
+static VALUE
+na_reshape(int argc, VALUE *argv, VALUE self)
+{
+    size_t *shape;
+    narray_t *na;
+    VALUE    copy;
+
+    shape = ALLOCA_N(size_t, argc);
+    // validate against self before any copy is made
+    na_check_reshape(argc, argv, self, shape);
+
+    // the new shape is applied to the object returned by self.dup
+    copy = rb_funcall(self, rb_intern("dup"), 0);
+    GetNArray(copy, na);
+    na_setup_shape(na, argc, shape);
+    return copy;
+}
+
+//----------------------------------------------------------------------
+
+/*
+  Return a view of self in which the axes sd..ndim-1 are merged into one
+  axis whose length is the product of their lengths; axes 0..sd-1 are kept.
+
+  A 0-dimensional array simply yields na_make_view(self).
+  sd outside 0..ndim-1 is treated as an internal error (rb_bug).
+*/
+VALUE
+na_flatten_dim(VALUE self, int sd)
+{
+    int i, nd, fd;
+    size_t j;
+    size_t *c, *pos, *idx1, *idx2;
+    size_t stride;
+    size_t  *shape, size;
+    stridx_t sdx;
+    narray_t *na;
+    narray_view_t *na1, *na2;
+    volatile VALUE view;
+
+    GetNArray(self,na);
+    nd = na->ndim;
+
+    if (nd==0) {
+        return na_make_view(self);
+    }
+    if (sd<0 || sd>=nd) {
+        rb_bug("na_flaten_dim: start_dim (%d) out of range",sd);
+    }
+
+    // new shape
+    // leading sd axes are copied; entry sd holds the merged length
+    shape = ALLOCA_N(size_t,sd+1);
+    for (i=0; i<sd; i++) {
+        shape[i] = na->shape[i];
+    }
+    size = 1;
+    for (i=sd; i<nd; i++) {
+        size *= na->shape[i];
+    }
+    shape[sd] = size;
+
+    // new object
+    view = na_s_allocate_view(CLASS_OF(self));
+    na_copy_flags(self, view);
+    GetNArrayView(view, na2);
+
+    // new stride
+    na_setup_shape((narray_t*)na2, sd+1, shape);
+    na2->stridx = ALLOC_N(stridx_t,sd+1);
+
+    switch(na->type) {
+    case NARRAY_DATA_T:
+    case NARRAY_FILEMAP_T:
+        // self owns its data: strides are built from the element stride,
+        // walking from the last new axis to the first
+        stride = nary_element_stride(self);
+        for (i=sd+1; i--; ) {
+            //printf("data: i=%d shpae[i]=%ld stride=%ld\n",i,shape[i],stride);
+            SDX_SET_STRIDE(na2->stridx[i],stride);
+            stride *= shape[i];
+        }
+        na2->offset = 0;
+        na2->data = self;
+        break;
+    case NARRAY_VIEW_T:
+        // self is itself a view: share its data object and offset
+        GetNArrayView(self, na1);
+        na2->data = na1->data;
+        na2->offset = na1->offset;
+        // kept axes: index arrays are duplicated, strides are copied
+        for (i=0; i<sd; i++) {
+            if (SDX_IS_INDEX(na1->stridx[i])) {
+                idx1 = SDX_GET_INDEX(na1->stridx[i]);
+                idx2 = ALLOC_N(size_t, shape[i]);
+                for (j=0; j<shape[i]; j++) {
+                    idx2[j] = idx1[j];
+                }
+                SDX_SET_INDEX(na2->stridx[i],idx2);
+            } else {
+                na2->stridx[i] = na1->stridx[i];
+                //printf("view: i=%d stridx=%d\n",i,SDX_GET_STRIDE(sdx));
+            }
+        }
+        // flat dimenion == last dimension
+        // NOTE(review): na_check_ladder is defined elsewhere; presumably it
+        // reports whether axes sd.. can be addressed with the last axis's
+        // stride alone -- confirm.
+        if (RTEST(na_check_ladder(self,sd))) {
+        //if (0) {
+            na2->stridx[sd] = na1->stridx[nd-1];
+        } else {
+            // set index
+            idx2 = ALLOC_N(size_t, shape[sd]);
+            SDX_SET_INDEX(na2->stridx[sd],idx2);
+            // init for md-loop
+            // c[] is the current position within the fd merged axes;
+            // pos[i+1] is the offset accumulated over merged axes 0..i
+            fd = nd-sd;
+            c = ALLOC_N(size_t, fd);
+            for (i=0; i<fd; i++) c[i]=0;
+            pos = ALLOC_N(size_t, fd+1);
+            pos[0] = 0;
+            // md-loop
+            for (i=j=0;;) {
+                // recompute offsets from axis i down to the innermost axis
+                for (; i<fd; i++) {
+                    sdx = na1->stridx[i+sd];
+                    if (SDX_IS_INDEX(sdx)) {
+                        pos[i+1] = pos[i] + SDX_GET_INDEX(sdx)[c[i]];
+                    } else {
+                        pos[i+1] = pos[i] + SDX_GET_STRIDE(sdx)*c[i];
+                    }
+                }
+                idx2[j++] = pos[i];
+                // advance the counters, innermost axis first, carrying
+                // into outer axes; finished when the outermost one wraps
+                for (;;) {
+                    if (i==0) goto loop_end;
+                    i--;
+                    c[i]++;
+                    if (c[i] < na1->base.shape[i+sd]) break;
+                    c[i] = 0;
+                }
+            }
+        loop_end:
+            xfree(pos);
+            xfree(c);
+        }
+        break;
+    }
+    return view;
+}
+
+// Flatten all axes of self: na_flatten_dim starting at axis 0.
+// Registered as the Ruby method "flatten" in Init_nary_data.
+VALUE
+na_flatten(VALUE self)
+{
+    return na_flatten_dim(self,0);
+}
+
+//----------------------------------------------------------------------
+
+// Smaller of a and b. Each argument may be evaluated twice, so do not pass
+// expressions with side effects.
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+/*
+  Returns a diagonal view of NArray
+  @overload  diagonal([offset,axes])
+  @param [Integer] offset  Diagonal offset from the main diagonal.
+    The default is 0. k>0 for diagonals above the main diagonal,
+    and k<0 for diagonals below the main diagonal.
+  @param [Array] axes  Array of axes to be used as the 2-d sub-arrays
+    from which the diagonals should be taken. Defaults to last-two
+    axes ([-2,-1]).
+  @return [Numo::NArray]  diagonal view of NArray.
+  @example
+    a = Numo::DFloat.new(4,5).seq
+    => Numo::DFloat#shape=[4,5]
+    [[0, 1, 2, 3, 4],
+     [5, 6, 7, 8, 9],
+     [10, 11, 12, 13, 14],
+     [15, 16, 17, 18, 19]]
+    b = a.diagonal(1)
+    => Numo::DFloat(view)#shape=[4]
+    [1, 7, 13, 19]
+    b.store(0)
+    a
+    => Numo::DFloat#shape=[4,5]
+    [[0, 0, 2, 3, 4],
+     [5, 6, 0, 8, 9],
+     [10, 11, 12, 0, 14],
+     [15, 16, 17, 18, 0]]
+    b.store([1,2,3,4])
+    a
+    => Numo::DFloat#shape=[4,5]
+    [[0, 1, 2, 3, 4],
+     [5, 6, 2, 8, 9],
+     [10, 11, 12, 3, 14],
+     [15, 16, 17, 18, 4]]
+ */
+VALUE
+na_diagonal(int argc, VALUE *argv, VALUE self)
+{
+    int  i, k, nd;
+    size_t  j;
+    size_t *idx0, *idx1, *diag_idx;
+    size_t *shape;
+    size_t  diag_size;
+    ssize_t stride, stride0, stride1;
+    narray_t *na;
+    narray_view_t *na1, *na2;
+    VALUE view;
+    VALUE vofs=0, vaxes=0;
+    ssize_t kofs;
+    size_t k0, k1;
+    int ax[2];
+
+    // check arguments
+    if (argc>2) {
+        rb_raise(rb_eArgError,"too many arguments (%d for 0..2)",argc);
+    }
+
+    // Arguments are recognized by type, so the offset (Fixnum) and the axes
+    // (Array) may come in either order. The switch has no default branch:
+    // an argument of any other type is left unused.
+    for (i=0; i<argc; i++) {
+        switch(TYPE(argv[i])) {
+        case T_FIXNUM:
+            if (vofs) {
+                rb_raise(rb_eArgError,"offset is given twice");
+            }
+            vofs = argv[i];
+            break;
+        case T_ARRAY:
+            if (vaxes) {
+                rb_raise(rb_eArgError,"axes-array is given twice");
+            }
+            vaxes = argv[i];
+            break;
+        }
+    }
+
+    // vofs is still 0 when no Fixnum argument was seen
+    if (vofs) {
+        kofs = NUM2SSIZET(vofs);
+    } else {
+        kofs = 0;
+    }
+
+    GetNArray(self,na);
+    nd = na->ndim;
+    if (nd < 2) {
+        rb_raise(nary_eDimensionError,"less than 2-d array");
+    }
+
+    if (vaxes) {
+        if (RARRAY_LEN(vaxes) != 2) {
+            rb_raise(rb_eArgError,"axes must be 2-element array");
+        }
+        ax[0] = NUM2INT(RARRAY_AREF(vaxes,0));
+        ax[1] = NUM2INT(RARRAY_AREF(vaxes,1));
+        if (ax[0]<-nd || ax[0]>=nd || ax[1]<-nd || ax[1]>=nd) {
+            rb_raise(rb_eArgError,"axis out of range:[%d,%d]",ax[0],ax[1]);
+        }
+        // negative axes count from the end
+        if (ax[0]<0) {ax[0] += nd;}
+        if (ax[1]<0) {ax[1] += nd;}
+        if (ax[0]==ax[1]) {
+            rb_raise(rb_eArgError,"same axes:[%d,%d]",ax[0],ax[1]);
+        }
+    } else {
+        // default: the last two axes
+        ax[0] = nd-2;
+        ax[1] = nd-1;
+    }
+
+    // Diagonal offset from the main diagonal.
+    // k0/k1 are the start positions along ax[0]/ax[1]: a non-negative
+    // offset shifts the start along ax[1], a negative one along ax[0].
+    if (kofs >= 0) {
+        k0 = 0;
+        k1 = kofs;
+        if (k1 >= na->shape[ax[1]]) {
+            rb_raise(rb_eArgError,"invalid diagonal offset(%"SZF"d) for "
+                     "last dimension size(%"SZF"d)",kofs,na->shape[ax[1]]);
+        }
+    } else {
+        k0 = -kofs;
+        k1 = 0;
+        if (k0 >= na->shape[ax[0]]) {
+            rb_raise(rb_eArgError,"invalid diagonal offset(=%"SZF"d) for "
+                     "last-1 dimension size(%"SZF"d)",kofs,na->shape[ax[0]]);
+        }
+    }
+
+    // number of elements on the selected diagonal
+    diag_size = MIN(na->shape[ax[0]]-k0,na->shape[ax[1]]-k1);
+
+    // new shape
+    // the remaining axes in their original order, then the diagonal last
+    shape = ALLOCA_N(size_t,nd-1);
+    for (i=k=0; i<nd; i++) {
+        if (i != ax[0] && i != ax[1]) {
+            shape[k++] = na->shape[i];
+        }
+    }
+    shape[k] = diag_size;
+
+    // new object
+    view = na_s_allocate_view(CLASS_OF(self));
+    na_copy_flags(self, view);
+    GetNArrayView(view, na2);
+
+    // new stride
+    na_setup_shape((narray_t*)na2, nd-1, shape);
+    na2->stridx = ALLOC_N(stridx_t, nd-1);
+
+    switch(na->type) {
+    case NARRAY_DATA_T:
+    case NARRAY_FILEMAP_T:
+        na2->offset = 0;
+        na2->data = self;
+        // walk the axes from last to first, accumulating the stride of
+        // each axis from the element stride and the axis lengths
+        stride = stride0 = stride1 = nary_element_stride(self);
+        for (i=nd,k=nd-2; i--; ) {
+            if (i==ax[1]) {
+                stride1 = stride;
+                if (kofs > 0) {
+                    na2->offset = kofs*stride;
+                }
+            } else if (i==ax[0]) {
+                stride0 = stride;
+                if (kofs < 0) {
+                    na2->offset = (-kofs)*stride;
+                }
+            } else {
+                SDX_SET_STRIDE(na2->stridx[--k],stride);
+            }
+            stride *= na->shape[i];
+        }
+        // one step along the diagonal advances both axes at once
+        SDX_SET_STRIDE(na2->stridx[nd-2],stride0+stride1);
+        break;
+
+    case NARRAY_VIEW_T:
+        GetNArrayView(self, na1);
+        na2->data = na1->data;
+        na2->offset = na1->offset;
+        // remaining axes: index arrays are duplicated, strides are copied
+        for (i=k=0; i<nd; i++) {
+            if (i != ax[0] && i != ax[1]) {
+                if (SDX_IS_INDEX(na1->stridx[i])) {
+                    idx0 = SDX_GET_INDEX(na1->stridx[i]);
+                    idx1 = ALLOC_N(size_t, na->shape[i]);
+                    for (j=0; j<na->shape[i]; j++) {
+                        idx1[j] = idx0[j];
+                    }
+                    SDX_SET_INDEX(na2->stridx[k],idx1);
+                } else {
+                    na2->stridx[k] = na1->stridx[i];
+                }
+                k++;
+            }
+        }
+        // diagonal axis: as soon as either source axis uses an index array,
+        // the diagonal gets an explicit index array holding the summed
+        // per-axis offsets; only stride+stride stays a plain stride
+        if (SDX_IS_INDEX(na1->stridx[ax[0]])) {
+            idx0 = SDX_GET_INDEX(na1->stridx[ax[0]]);
+            diag_idx = ALLOC_N(size_t, diag_size);
+            if (SDX_IS_INDEX(na1->stridx[ax[1]])) {
+                idx1 = SDX_GET_INDEX(na1->stridx[ax[1]]);
+                for (j=0; j<diag_size; j++) {
+                    diag_idx[j] = idx0[j+k0] + idx1[j+k1];
+                }
+            } else {
+                stride1 = SDX_GET_STRIDE(na1->stridx[ax[1]]);
+                for (j=0; j<diag_size; j++) {
+                    diag_idx[j] = idx0[j+k0] + stride1*(j+k1);
+                }
+            }
+            SDX_SET_INDEX(na2->stridx[nd-2],diag_idx);
+        } else {
+            stride0 = SDX_GET_STRIDE(na1->stridx[ax[0]]);
+            if (SDX_IS_INDEX(na1->stridx[ax[1]])) {
+                idx1 = SDX_GET_INDEX(na1->stridx[ax[1]]);
+                diag_idx = ALLOC_N(size_t, diag_size);
+                for (j=0; j<diag_size; j++) {
+                    diag_idx[j] = stride0*(j+k0) + idx1[j+k1];
+                }
+                SDX_SET_INDEX(na2->stridx[nd-2],diag_idx);
+            } else {
+                // both strided: fold the start position into the offset
+                stride1 = SDX_GET_STRIDE(na1->stridx[ax[1]]);
+                na2->offset += stride0*k0 + stride1*k1;
+                SDX_SET_STRIDE(na2->stridx[nd-2],stride0+stride1);
+            }
+        }
+        break;
+    }
+    return view;
+}
+
+//----------------------------------------------------------------------
+
+
+// Everything from here to the matching #endif (just before Init_nary_data)
+// is compiled out; the "dot" registration in Init_nary_data is commented
+// out as well.
+#if 0
+#ifdef SWAP
+#undef SWAP
+#endif
+// Exchange the values of a and b, using t as scratch storage.
+#define SWAP(a,b,t) {t=a;a=b;b=t;}
+
+static VALUE
+na_new_dimension_for_dot(VALUE self, int pos, int len, bool transpose)
+{
+    int i, k, l, nd;
+    size_t  j;
+    size_t *idx1, *idx2;
+    size_t *shape;
+    ssize_t stride;
+    narray_t *na;
+    narray_view_t *na1, *na2;
+    size_t shape_n;
+    stridx_t stridx_n;
+    volatile VALUE view;
+
+    GetNArray(self,na);
+    nd = na->ndim;
+
+    view = na_s_allocate_view(CLASS_OF(self));
+
+    na_copy_flags(self, view);
+    GetNArrayView(view, na2);
+
+    // new dimension
+    if (pos < 0) pos += nd;
+    if (pos > nd || pos < 0) {
+        rb_raise(rb_eRangeError,"new dimension is out of range");
+    }
+    nd += len;
+    shape = ALLOCA_N(size_t,nd);
+    na2->stridx = ALLOC_N(stridx_t,nd);
+
+    switch(na->type) {
+    case NARRAY_DATA_T:
+    case NARRAY_FILEMAP_T:
+        i = k = 0;
+        while (i < nd) {
+            if (i == pos && len > 0) {
+                for (l=0; l<len; l++) {
+                    shape[i++] = 1;
+                }
+            } else {
+                shape[i++] = na->shape[k++];
+            }
+        }
+        na_setup_shape((narray_t*)na2, nd, shape);
+        stride = nary_element_stride(self);
+        for (i=nd; i--;) {
+            SDX_SET_STRIDE(na2->stridx[i], stride);
+            stride *= shape[i];
+        }
+        na2->offset = 0;
+        na2->data = self;
+        break;
+    case NARRAY_VIEW_T:
+        GetNArrayView(self, na1);
+        i = k = 0;
+        while (i < nd) {
+            if (i == pos && len > 0) {
+                if (SDX_IS_INDEX(na1->stridx[k])) {
+                    stride = SDX_GET_INDEX(na1->stridx[k])[0];
+                } else {
+                    stride = SDX_GET_STRIDE(na1->stridx[k]);
+                }
+                for (l=0; l<len; l++) {
+                    shape[i] = 1;
+                    SDX_SET_STRIDE(na2->stridx[i], stride);
+                    i++;
+                }
+            } else {
+                shape[i] = na1->base.shape[k];
+                if (SDX_IS_INDEX(na1->stridx[k])) {
+                    idx1 = SDX_GET_INDEX(na1->stridx[k]);
+                    idx2 = ALLOC_N(size_t,na1->base.shape[k]);
+                    for (j=0; j<na1->base.shape[k]; j++) {
+                        idx2[j] = idx1[j];
+                    }
+                    SDX_SET_INDEX(na2->stridx[i], idx2);
+                } else {
+                    na2->stridx[i] = na1->stridx[k];
+                }
+                i++; k++;
+            }
+        }
+        na_setup_shape((narray_t*)na2, nd, shape);
+        na2->offset = na1->offset;
+        na2->data = na1->data;
+        break;
+    }
+
+    if (transpose) {
+	SWAP(na2->base.shape[nd-1], na2->base.shape[nd-2], shape_n);
+	SWAP(na2->stridx[nd-1], na2->stridx[nd-2], stridx_n);
+    }
+
+    return view;
+}
+
+
+//----------------------------------------------------------------------
+
+/*
+ *  call-seq:
+ *     narray.dot(other) => narray
+ *
+ *  Returns dot product.
+ *
+ */
+
+static VALUE
+numo_na_dot(VALUE self, VALUE other)
+{
+    VALUE test;
+    volatile VALUE a1=self, a2=other;
+    narray_t *na1, *na2;
+
+    test = rb_funcall(a1, id_respond_to_p, 1, sym_mulsum);
+    if (!RTEST(test)) {
+        rb_raise(rb_eNoMethodError,"requires mulsum method for dot method");
+    }
+    GetNArray(a1,na1);
+    GetNArray(a2,na2);
+    if (na1->ndim==0 || na2->ndim==0) {
+        rb_raise(nary_eDimensionError,"zero dimensional narray");
+    }
+    if (na2->ndim > 1) {
+        if (na1->shape[na1->ndim-1] != na2->shape[na2->ndim-2]) {
+            rb_raise(nary_eShapeError,"shape mismatch: self.shape[-1](=%"SZF"d) != other.shape[-2](=%"SZF"d)",
+                     na1->shape[na1->ndim-1], na2->shape[na2->ndim-2]);
+        }
+        // insert new axis [ ..., last-1-dim, newaxis*other.ndim, last-dim ]
+        a1 = na_new_dimension_for_dot(a1, na1->ndim-1, na2->ndim-1, 0);
+        // insert & transpose [ newaxis*self.ndim, ..., last-dim, last-1-dim ]
+        a2 = na_new_dimension_for_dot(a2, 0, na1->ndim-1, 1);
+    }
+    return rb_funcall(a1,id_mulsum,2,a2,INT2FIX(-1));
+}
+#endif
+
+// Register the data-manipulation methods defined in this file on cNArray
+// and initialize the IDs/symbols kept in file-level variables.
+void
+Init_nary_data()
+{
+    rb_define_method(cNArray, "copy", na_copy, 0); // deprecated
+
+    rb_define_method(cNArray, "flatten", na_flatten, 0);
+    rb_define_method(cNArray, "swapaxes", na_swapaxes, 2);
+    rb_define_method(cNArray, "transpose", na_transpose, -1);
+
+    rb_define_method(cNArray, "reshape", na_reshape,-1);
+    rb_define_method(cNArray, "reshape!", na_reshape_bang,-1);
+    /*
+    rb_define_alias(cNArray,  "shape=","reshape!");
+    */
+    rb_define_method(cNArray, "diagonal", na_diagonal,-1);
+
+    rb_define_method(cNArray, "swap_byte", nary_swap_byte, 0);
+    // The byte-order aliases below are only registered when neither
+    // DYNAMIC_ENDIAN nor WORDS_BIGENDIAN is defined; both other branches
+    // are empty.
+#ifdef DYNAMIC_ENDIAN
+#else
+#ifdef WORDS_BIGENDIAN
+#else // LITTLE_ENDIAN
+    rb_define_alias(cNArray, "hton", "swap_byte");
+    rb_define_alias(cNArray, "network_order?", "byte_swapped?");
+    rb_define_alias(cNArray, "little_endian?", "host_order?");
+    rb_define_alias(cNArray, "vacs_order?", "host_order?");
+#endif
+#endif
+    rb_define_method(cNArray, "to_network", nary_to_network, 0);
+    rb_define_method(cNArray, "to_vacs", nary_to_vacs, 0);
+    rb_define_method(cNArray, "to_host", nary_to_host, 0);
+    rb_define_method(cNArray, "to_swapped", nary_to_swapped, 0);
+
+    //rb_define_method(cNArray, "dot", numo_na_dot, 1);
+
+    // method IDs / symbols looked up once at load time
+    id_mulsum       = rb_intern("mulsum");
+    sym_mulsum      = ID2SYM(id_mulsum);
+    id_respond_to_p = rb_intern("respond_to?");
+    id_store        = rb_intern("store");
+    id_swap_byte    = rb_intern("swap_byte");
+}
diff --git a/ext/numo/narray/depend.erb b/ext/numo/narray/depend.erb
new file mode 100644
index 0000000..2a35c76
--- /dev/null
+++ b/ext/numo/narray/depend.erb
@@ -0,0 +1,34 @@
+<%# Extra make rules. extconf.rb renders this template into the "depend" file. %>
+TAGSRC = \
+ ../../ruby/include/ruby/*.h \
+ ../../ruby/*.c \
+ *.h \
+ types/*.h \
+ *.c \
+ types/*.c
+
+tags : TAGS
+TAGS : $(TAGSRC)
+	etags $(TAGSRC)
+
+doc :
+	yard doc *.c types/*.c
+
+C_TMPL = <%=Dir.glob("gen/tmpl*/*.c").join(" ")%>
+
+COGEN = gen/cogen.rb
+DEPENDS = $(C_TMPL) gen/*.rb
+
+<%
+   # One rule per type definition: gen/def/NAME.rb -> types/NAME.c
+   type_c = []
+   type_rb = Dir.glob("gen/def/*.rb")
+   type_rb.each do |s|
+     type_c << c = "types/"+File.basename(s,".rb")+".c"
+%>
+<%# Run the generator with $(RUBY), the interpreter the Makefile was created for, instead of whichever "ruby" comes first in PATH. %>
+<%=c%>: <%=s%> $(DEPENDS)
+	$(MAKEDIRS) $(@D) types
+	$(RUBY) $(COGEN) -l -o $@ <%=s%>
+<% end %>
+
+src : <%= type_c.join(" ") %>
+
+CLEANOBJS = *.o */*.o *.bak types/*.c
diff --git a/ext/numo/narray/extconf.rb b/ext/numo/narray/extconf.rb
new file mode 100644
index 0000000..47a7684
--- /dev/null
+++ b/ext/numo/narray/extconf.rb
@@ -0,0 +1,97 @@
+require 'rbconfig.rb'
+require 'mkmf'
+require "erb"
+
+if RUBY_VERSION < "2.0.0"
+  puts "Numo::NArray requires Ruby version 2.0 or later."
+  exit(1)
+end
+
+rm_f 'numo/extconf.h'
+
+#$CFLAGS="-g3 -O0 -Wall"
+#$CFLAGS=" $(cflags) -O3 -m64 -msse2 -funroll-loops"
+#$CFLAGS=" $(cflags) -O3"
+$INCFLAGS = "-Itypes #$INCFLAGS"
+
+$INSTALLFILES = Dir.glob(%w[numo/*.h numo/types/*.h]).map{|x| [x,'$(archdir)'] }
+$INSTALLFILES << ['numo/extconf.h','$(archdir)']
+if /cygwin|mingw/ =~ RUBY_PLATFORM
+  $INSTALLFILES << ['libnarray.a', '$(archdir)']
+end
+
+srcs = %w(
+narray
+array
+step
+index
+ndloop
+data
+types/bit
+types/int8
+types/int16
+types/int32
+types/int64
+types/uint8
+types/uint16
+types/uint32
+types/uint64
+types/sfloat
+types/dfloat
+types/scomplex
+types/dcomplex
+types/robject
+math
+SFMT
+struct
+rand
+)
+
+if have_header("stdbool.h")
+  stdbool = "stdbool.h"
+else
+  stdbool = nil
+end
+
+if have_header("stdint.h")
+  stdint = "stdint.h"
+elsif have_header("sys/types.h")
+  stdint = "sys/types.h"
+else
+  stdint = nil
+end
+
+have_type("bool", stdbool)
+unless have_type("u_int8_t", stdint)
+  have_type("uint8_t",stdint)
+end
+unless have_type("u_int16_t", stdint)
+  have_type("uint16_t",stdint)
+end
+have_type("int32_t", stdint)
+unless have_type("u_int32_t", stdint)
+  have_type("uint32_t",stdint)
+end
+have_type("int64_t", stdint)
+unless have_type("u_int64_t", stdint)
+  have_type("uint64_t", stdint)
+end
+have_func("exp10")
+
+have_var("rb_cComplex")
+
+$objs = srcs.collect{|i| i+".o"}
+
+create_header('numo/extconf.h')
+
+depend_path = File.join(__dir__, "depend")
+File.open(depend_path, "w") do |depend|
+  depend_erb_path = File.join(__dir__, "depend.erb")
+  File.open(depend_erb_path, "r") do |depend_erb|
+    erb = ERB.new(depend_erb.read)
+    erb.filename = depend_erb_path
+    depend.print(erb.result)
+  end
+end
+
+create_makefile('numo/narray')
diff --git a/ext/numo/narray/gen/cogen.rb b/ext/numo/narray/gen/cogen.rb
new file mode 100644
index 0000000..20264c3
--- /dev/null
+++ b/ext/numo/narray/gen/cogen.rb
@@ -0,0 +1,56 @@
+#! /usr/bin/env ruby
+
+thisdir = File.dirname(__FILE__)
+libpath = File.absolute_path(File.dirname(__FILE__))+"/../../../../lib"
+$LOAD_PATH.unshift libpath
+
+require_relative "./narray_def"
+
+while true
+  if ARGV[0] == "-l"
+    require "erbpp/line_number"
+    ARGV.shift
+  elsif ARGV[0] == "-o"
+    ARGV.shift
+    $output = ARGV.shift
+    require "fileutils"
+    FileUtils.rm_f($output)
+  else
+    break
+  end
+end
+
+if ARGV.size != 1
+  puts "usage:\n  ruby #{$0} [-l] erb_base [type_file]"
+  exit 1
+end
+
+type_file, = ARGV
+type_name = File.basename(type_file,".rb")
+
+erb_dir = ["tmpl"]
+erb_dir.unshift("tmpl_bit") if (type_name == "bit")
+erb_dir.map!{|d| File.join(thisdir,d)}
+
+code = DefLib.new do
+  set erb_dir: erb_dir
+  set erb_suffix: ".c"
+  set ns_var: "mNumo"
+
+  set file_name: $output||""
+  set include_files: ["numo/types/#{type_name}.h"]
+  set lib_name: "numo_"+type_name
+
+  def_class do
+    extend NArrayMethod
+    extend NArrayType
+    eval File.read(type_file), binding, type_file
+    eval File.read(File.join(thisdir,"spec.rb")), binding, "spec.rb"
+  end
+end.result
+
+if $output
+  open($output,"w").write(code)
+else
+  $stdout.write(code)
+end
diff --git a/ext/numo/narray/gen/def/bit.rb b/ext/numo/narray/gen/def/bit.rb
new file mode 100644
index 0000000..9173546
--- /dev/null
+++ b/ext/numo/narray/gen/def/bit.rb
@@ -0,0 +1,36 @@
+# Type definition for Numo::Bit.
+# gen/cogen.rb evaluates this file inside its def_class block to generate
+# the C source of the type.
+set name:                "bit"
+set type_name:           "bit"
+set full_class_name:     "Numo::Bit"
+set class_name:          "Bit"
+set class_alias:         nil
+set class_var:           "cT"
+set ctype:               "BIT_DIGIT"
+
+# Trait flags of this element type.
+set has_math:      false
+set is_bit:        true
+set is_int:        false
+set is_unsigned:   true
+set is_float:      false
+set is_complex:    false
+set is_object:     false
+set is_real:       false
+set is_comparable: false
+set is_double_precision: false
+
+# NOTE(review): upcast_rb presumably declares the result class when this
+# type is combined with a Ruby Integer/Float/Complex -- confirm against
+# gen/narray_def.rb.
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+# NOTE(review): upcast presumably declares the result class when this type
+# is combined with the named NArray class -- confirm as above.
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32",  "Int32"
+upcast "Int16",  "Int16"
+upcast "Int8",   "Int8"
+upcast "UInt64", "UInt64"
+upcast "UInt32", "UInt32"
+upcast "UInt16", "UInt16"
+upcast "UInt8",  "UInt8"
diff --git a/ext/numo/narray/gen/def/dcomplex.rb b/ext/numo/narray/gen/def/dcomplex.rb
new file mode 100644
index 0000000..7ced724
--- /dev/null
+++ b/ext/numo/narray/gen/def/dcomplex.rb
@@ -0,0 +1,38 @@
+# Type definition for Numo::DComplex.
+# gen/cogen.rb evaluates this file inside its def_class block to generate
+# the C source of the type.
+set name:                "dcomplex"
+set type_name:           "dcomplex"
+set full_class_name:     "Numo::DComplex"
+set class_name:          "DComplex"
+set class_alias:         "Complex64"
+set class_var:           "cT"
+set ctype:               "dcomplex"
+set real_class_name:     "DFloat"
+set real_ctype:          "double"
+
+# Trait flags of this element type.
+set has_math:            true
+set is_bit:              false
+set is_int:              false
+set is_unsigned:         false
+set is_float:            true
+set is_real:             false
+set is_complex:          true
+set is_object:           false
+set is_comparable:       false
+set is_double_precision: true
+
+# NOTE(review): upcast_rb presumably declares the result class when this
+# type is combined with a Ruby Integer/Float/Complex; how a missing second
+# argument is resolved is decided in gen/narray_def.rb -- confirm there.
+upcast_rb "Integer"
+upcast_rb "Float"
+upcast_rb "Complex"
+
+# NOTE(review): upcast presumably declares the result class when this type
+# is combined with the named NArray class -- confirm as above.
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "DComplex"
+upcast "DFloat",   "DComplex"
+upcast "SFloat",   "DComplex"
+upcast "Int64",    "DComplex"
+upcast "Int32",    "DComplex"
+upcast "Int16",    "DComplex"
+upcast "Int8",     "DComplex"
+upcast "UInt64",   "DComplex"
+upcast "UInt32",   "DComplex"
+upcast "UInt16",   "DComplex"
+upcast "UInt8",    "DComplex"
diff --git a/ext/numo/narray/gen/def/dfloat.rb b/ext/numo/narray/gen/def/dfloat.rb
new file mode 100644
index 0000000..fcbe812
--- /dev/null
+++ b/ext/numo/narray/gen/def/dfloat.rb
@@ -0,0 +1,36 @@
+# Type definition for Numo::DFloat.
+# gen/cogen.rb evaluates this file inside its def_class block to generate
+# the C source of the type.
+set name:                "dfloat"
+set type_name:           "dfloat"
+set full_class_name:     "Numo::DFloat"
+set class_name:          "DFloat"
+set class_alias:         "Float64"
+set class_var:           "cT"
+set ctype:               "double"
+
+# Trait flags of this element type.
+set has_math:            true
+set is_bit:              false
+set is_int:              false
+set is_unsigned:         false
+set is_float:            true
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: true
+
+# NOTE(review): upcast_rb presumably declares the result class when this
+# type is combined with a Ruby Integer/Float/Complex; how a missing second
+# argument is resolved is decided in gen/narray_def.rb -- confirm there.
+upcast_rb "Integer"
+upcast_rb "Float"
+upcast_rb "Complex", "DComplex"
+
+# NOTE(review): upcast presumably declares the result class when this type
+# is combined with the named NArray class -- confirm as above.
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "DComplex"
+upcast "DFloat",   "DFloat"
+upcast "SFloat",   "DFloat"
+upcast "Int64",    "DFloat"
+upcast "Int32",    "DFloat"
+upcast "Int16",    "DFloat"
+upcast "Int8",     "DFloat"
+upcast "UInt64",   "DFloat"
+upcast "UInt32",   "DFloat"
+upcast "UInt16",   "DFloat"
+upcast "UInt8",    "DFloat"
diff --git a/ext/numo/narray/gen/def/int16.rb b/ext/numo/narray/gen/def/int16.rb
new file mode 100644
index 0000000..99fae54
--- /dev/null
+++ b/ext/numo/narray/gen/def/int16.rb
@@ -0,0 +1,35 @@
+# Type definition for Numo::Int16.
+# gen/cogen.rb evaluates this file inside its def_class block to generate
+# the C source of the type.
+set name:                "int16"
+set type_name:           "int16"
+set full_class_name:     "Numo::Int16"
+set class_name:          "Int16"
+set class_var:           "cT"
+set ctype:               "int16_t"
+
+# Trait flags of this element type.
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         false
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+# NOTE(review): upcast_rb presumably declares the result class when this
+# type is combined with a Ruby Integer/Float/Complex; how a missing second
+# argument is resolved is decided in gen/narray_def.rb -- confirm there.
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+# NOTE(review): upcast presumably declares the result class when this type
+# is combined with the named NArray class; entries without a second
+# argument rely on upcast's default -- confirm as above.
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32",  "Int32"
+upcast "Int16"
+upcast "Int8"
+upcast "UInt64", "Int64"
+upcast "UInt32", "Int32"
+upcast "UInt16"
+upcast "UInt8"
diff --git a/ext/numo/narray/gen/def/int32.rb b/ext/numo/narray/gen/def/int32.rb
new file mode 100644
index 0000000..dc519b0
--- /dev/null
+++ b/ext/numo/narray/gen/def/int32.rb
@@ -0,0 +1,35 @@
+# Type definition for Numo::Int32.
+# gen/cogen.rb evaluates this file inside its def_class block to generate
+# the C source of the type.
+set name:                "int32"
+set type_name:           "int32"
+set full_class_name:     "Numo::Int32"
+set class_name:          "Int32"
+set class_var:           "cT"
+set ctype:               "int32_t"
+
+# Trait flags of this element type.
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         false
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+# NOTE(review): upcast_rb presumably declares the result class when this
+# type is combined with a Ruby Integer/Float/Complex; how a missing second
+# argument is resolved is decided in gen/narray_def.rb -- confirm there.
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+# NOTE(review): upcast presumably declares the result class when this type
+# is combined with the named NArray class; entries without a second
+# argument rely on upcast's default -- confirm as above.
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32"
+upcast "Int16"
+upcast "Int8"
+upcast "UInt64", "Int64"
+upcast "UInt32"
+upcast "UInt16"
+upcast "UInt8"
diff --git a/ext/numo/narray/gen/def/int64.rb b/ext/numo/narray/gen/def/int64.rb
new file mode 100644
index 0000000..221bad2
--- /dev/null
+++ b/ext/numo/narray/gen/def/int64.rb
@@ -0,0 +1,35 @@
+# Type definition for Numo::Int64.
+# gen/cogen.rb evaluates this file inside its def_class block to generate
+# the C source of the type.
+set name:                "int64"
+set type_name:           "int64"
+set full_class_name:     "Numo::Int64"
+set class_name:          "Int64"
+set class_var:           "cT"
+set ctype:               "int64_t"
+
+# Trait flags of this element type.
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         false
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+# NOTE(review): upcast_rb presumably declares the result class when this
+# type is combined with a Ruby Integer/Float/Complex; how a missing second
+# argument is resolved is decided in gen/narray_def.rb -- confirm there.
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+# NOTE(review): upcast presumably declares the result class when this type
+# is combined with the named NArray class; entries without a second
+# argument rely on upcast's default -- confirm as above.
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64"
+upcast "Int32"
+upcast "Int16"
+upcast "Int8"
+upcast "UInt64"
+upcast "UInt32"
+upcast "UInt16"
+upcast "UInt8"
diff --git a/ext/numo/narray/gen/def/int8.rb b/ext/numo/narray/gen/def/int8.rb
new file mode 100644
index 0000000..121a205
--- /dev/null
+++ b/ext/numo/narray/gen/def/int8.rb
@@ -0,0 +1,35 @@
+set name:                "int8"
+set type_name:           "int8"
+set full_class_name:     "Numo::Int8"
+set class_name:          "Int8"
+set class_var:           "cT"
+set ctype:               "int8_t"
+
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         false
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32",  "Int32"
+upcast "Int16",  "Int16"
+upcast "Int8",   "Int8"
+upcast "UInt64", "Int64"
+upcast "UInt32", "Int64"
+upcast "UInt16", "Int32"
+upcast "UInt8",  "Int16"
diff --git a/ext/numo/narray/gen/def/robject.rb b/ext/numo/narray/gen/def/robject.rb
new file mode 100644
index 0000000..e0996ca
--- /dev/null
+++ b/ext/numo/narray/gen/def/robject.rb
@@ -0,0 +1,36 @@
+set name:                "robject"
+set type_name:           "robject"
+set full_class_name:     "Numo::RObject"
+set class_name:          "RObject"
+set class_var:           "cT"
+set ctype:               "VALUE"
+set real_class_name:     "RObject"
+set real_ctype:          "VALUE"
+
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         false
+set is_float:            true
+set is_real:             true
+set is_complex:          false
+set is_object:           true
+set is_comparable:       true
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float"
+upcast_rb "Complex"
+
+upcast "DComplex", "RObject"
+upcast "SComplex", "RObject"
+upcast "DFloat",   "RObject"
+upcast "SFloat",   "RObject"
+upcast "Int64",    "RObject"
+upcast "Int32",    "RObject"
+upcast "Int16",    "RObject"
+upcast "Int8",     "RObject"
+upcast "UInt64",   "RObject"
+upcast "UInt32",   "RObject"
+upcast "UInt16",   "RObject"
+upcast "UInt8",    "RObject"
diff --git a/ext/numo/narray/gen/def/scomplex.rb b/ext/numo/narray/gen/def/scomplex.rb
new file mode 100644
index 0000000..b0ceffb
--- /dev/null
+++ b/ext/numo/narray/gen/def/scomplex.rb
@@ -0,0 +1,38 @@
+set name:                "scomplex"
+set type_name:           "scomplex"
+set full_class_name:     "Numo::SComplex"
+set class_name:          "SComplex"
+set class_alias:         "Complex32"
+set class_var:           "cT"
+set ctype:               "Scomplex"
+set real_class_name:     "SFloat"
+set real_ctype:          "float"
+
+set has_math:            true
+set is_bit:              false
+set is_int:              false
+set is_unsigned:         false
+set is_float:            true
+set is_real:             false
+set is_complex:          true
+set is_object:           false
+set is_comparable:       false
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float"
+upcast_rb "Complex"
+
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat",   "DComplex"
+upcast "SFloat",   "SComplex"
+upcast "Int64",    "SComplex"
+upcast "Int32",    "SComplex"
+upcast "Int16",    "SComplex"
+upcast "Int8",     "SComplex"
+upcast "UInt64",   "SComplex"
+upcast "UInt32",   "SComplex"
+upcast "UInt16",   "SComplex"
+upcast "UInt8",    "SComplex"
diff --git a/ext/numo/narray/gen/def/sfloat.rb b/ext/numo/narray/gen/def/sfloat.rb
new file mode 100644
index 0000000..47be181
--- /dev/null
+++ b/ext/numo/narray/gen/def/sfloat.rb
@@ -0,0 +1,36 @@
+set name:                "sfloat"
+set type_name:           "sfloat"
+set full_class_name:     "Numo::SFloat"
+set class_name:          "SFloat"
+set class_alias:         "Float32"
+set class_var:           "cT"
+set ctype:               "float"
+
+set has_math:            true
+set is_bit:              false
+set is_int:              false
+set is_unsigned:         false
+set is_float:            true
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float"
+upcast_rb "Complex", "SComplex"
+
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat",   "DFloat"
+upcast "SFloat",   "SFloat"
+upcast "Int64",    "SFloat"
+upcast "Int32",    "SFloat"
+upcast "Int16",    "SFloat"
+upcast "Int8",     "SFloat"
+upcast "UInt64",   "SFloat"
+upcast "UInt32",   "SFloat"
+upcast "UInt16",   "SFloat"
+upcast "UInt8",    "SFloat"
diff --git a/ext/numo/narray/gen/def/uint16.rb b/ext/numo/narray/gen/def/uint16.rb
new file mode 100644
index 0000000..83012c6
--- /dev/null
+++ b/ext/numo/narray/gen/def/uint16.rb
@@ -0,0 +1,35 @@
+set name:                "uint16"
+set type_name:           "uint16"
+set full_class_name:     "Numo::UInt16"
+set class_name:          "UInt16"
+set class_var:           "cT"
+set ctype:               "u_int16_t"
+
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         true
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32",  "Int32"
+upcast "Int16",  "Int16"
+upcast "Int8",   "Int16"
+upcast "UInt64", "UInt64"
+upcast "UInt32", "UInt32"
+upcast "UInt16"
+upcast "UInt8"
diff --git a/ext/numo/narray/gen/def/uint32.rb b/ext/numo/narray/gen/def/uint32.rb
new file mode 100644
index 0000000..1269c4e
--- /dev/null
+++ b/ext/numo/narray/gen/def/uint32.rb
@@ -0,0 +1,35 @@
+set name:                "uint32"
+set type_name:           "uint32"
+set full_class_name:     "Numo::UInt32"
+set class_name:          "UInt32"
+set class_var:           "cT"
+set ctype:               "u_int32_t"
+
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         true
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32",  "Int32"
+upcast "Int16",  "Int32"
+upcast "Int8",   "Int32"
+upcast "UInt64", "UInt64"
+upcast "UInt32"
+upcast "UInt16"
+upcast "UInt8"
diff --git a/ext/numo/narray/gen/def/uint64.rb b/ext/numo/narray/gen/def/uint64.rb
new file mode 100644
index 0000000..4db9bef
--- /dev/null
+++ b/ext/numo/narray/gen/def/uint64.rb
@@ -0,0 +1,35 @@
+set name:                "uint64"
+set type_name:           "uint64"
+set full_class_name:     "Numo::UInt64"
+set class_name:          "UInt64"
+set class_var:           "cT"
+set ctype:               "u_int64_t"
+
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         true
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32",  "Int64"
+upcast "Int16",  "Int64"
+upcast "Int8",   "Int64"
+upcast "UInt64"
+upcast "UInt32"
+upcast "UInt16"
+upcast "UInt8"
diff --git a/ext/numo/narray/gen/def/uint8.rb b/ext/numo/narray/gen/def/uint8.rb
new file mode 100644
index 0000000..39ed153
--- /dev/null
+++ b/ext/numo/narray/gen/def/uint8.rb
@@ -0,0 +1,35 @@
+set name:                "uint8"
+set type_name:           "uint8"
+set full_class_name:     "Numo::UInt8"
+set class_name:          "UInt8"
+set class_var:           "cT"
+set ctype:               "u_int8_t"
+
+set has_math:            false
+set is_bit:              false
+set is_int:              true
+set is_unsigned:         true
+set is_float:            false
+set is_complex:          false
+set is_object:           false
+set is_real:             true
+set is_comparable:       true
+set is_double_precision: false
+
+upcast_rb "Integer"
+upcast_rb "Float", "DFloat"
+upcast_rb "Complex", "DComplex"
+
+upcast "RObject",  "RObject"
+upcast "DComplex", "DComplex"
+upcast "SComplex", "SComplex"
+upcast "DFloat", "DFloat"
+upcast "SFloat", "SFloat"
+upcast "Int64",  "Int64"
+upcast "Int32",  "Int32"
+upcast "Int16",  "Int16"
+upcast "Int8",   "Int8"
+upcast "UInt64", "UInt64"
+upcast "UInt32", "UInt32"
+upcast "UInt16", "UInt16"
+upcast "UInt8"
diff --git a/ext/numo/narray/gen/erbpp2.rb b/ext/numo/narray/gen/erbpp2.rb
new file mode 100644
index 0000000..58b35d7
--- /dev/null
+++ b/ext/numo/narray/gen/erbpp2.rb
@@ -0,0 +1,325 @@
+require "erb"
+
+class ErbPP
+
+  def initialize(parent=nil, erb_base=nil, **opts, &block)
+    @parent = parent
+    @children = []
+    @opts = opts
+    set erb_base: erb_base if erb_base
+    @parent.add_child(self) if @parent
+    instance_eval(&block) if block
+  end
+
+  attr_reader :children
+  attr_accessor :parent
+
+  def add_child(child)
+    @children.push(child)
+  end
+
+  def set(**opts)
+    @opts.merge!(opts)
+  end
+
+  def get(key, *args, &block)
+    if respond_to?(key)
+      return send(key, *args, &block)
+    end
+    if args.empty? && block.nil? && @opts.has_key?(key)
+      return @opts[key]
+    end
+    if @parent
+      return @parent.get(key, *args, &block)
+    end
+    nil
+  end
+
+  def description
+    if s = @opts[:description] || @opts[:desc]
+      s.gsub(/\@\{/,"[").gsub(/\@\}/,"]")
+    end
+  end
+
+  alias desc description
+
+  alias method_missing_alias method_missing
+
+  def method_missing(_meth_id, *args, &block)
+    if args.empty?
+      #$stderr.puts _meth_id.inspect
+      v = get(_meth_id, *args, &block)
+      return v if !v.nil?
+    end
+    method_missing_alias(_meth_id, *args, &block)
+  end
+
+  # ERB Loader
+
+  def load_erb(base_name)
+    safe_level = nil
+    trim_mode = '%<>'
+    file = base_name + get(:erb_suffix)
+    dirs = get(:erb_dir)
+    dirs = [dirs] if !dirs.kind_of?(Array)
+    dirs.each do |x|
+      Dir.glob(x).each do |dir|
+        path = File.join(dir,file)
+        if File.exist?(path)
+          erb = ERB.new(File.read(path), safe_level, trim_mode)
+          erb.filename = path
+          return erb
+        end
+      end
+    end
+    raise "file not found: #{file.inspect} in #{dirs.inspect}"
+  end
+
+  def run
+    if base = @opts[:erb_base]
+      load_erb(base).run(binding)
+    end
+  end
+
+  def result
+    if base = @opts[:erb_base]
+      load_erb(base).result(binding)
+    end
+  end
+
+  def write(output)
+    File.open(output,"wt") do |f|
+      f.print(result)
+    end
+  end
+
+  def init_def
+  end
+
+  def find_tmpl(name)
+    @parent.children.find{|x| x.name == name }
+  end
+
+  def find(name)
+    children.find{|x| x.name == name }
+  end
+end
+
+
+class DefLib < ErbPP
+  def initialize(parent=nil, **opts, &block)
+    opts[:erb_base] ||= 'lib'
+    opts[:include_files] ||= []
+    super(parent, **opts, &block)
+  end
+  def id_assign
+    ids = []
+    @children.each{|c| a=c.get(:id_list); ids.concat(a) if a}
+    ids.sort.uniq.map{|x| "id_#{x[1]} = rb_intern(\"#{x[0]}\");"}
+  end
+  def id_decl
+    ids = []
+    @children.each{|c| a=c.get(:id_list); ids.concat(a) if a}
+    ids.sort.uniq.map{|x| "static ID id_#{x[1]};\n"}
+  end
+  def def_class(**opts, &block)
+    DefClass.new(self, **opts, &block)
+  end
+  def def_module(**opts, &block)
+    DefModule.new(self, **opts, &block)
+  end
+end
+
+module DeclMethod # DSL mixin: every helper creates a child node under self
+  def def_alloc_func(m, erb_path=nil, **opts, &block) # template defaults to m
+    DefAllocFunc.new(self, erb_path||m, name:m, singleton:true, **opts, &block)
+  end
+  def undef_alloc_func # child emitting rb_undef_alloc_func
+    UndefAllocFunc.new(self)
+  end
+  def def_method(m, erb_path=nil, **opts, &block) # instance method m; template defaults to m
+    DefMethod.new(self, erb_path||m, name:m, **opts, &block)
+  end
+  def undef_method(m) # child emitting rb_undef_method for m
+    UndefMethod.new(self,name:m)
+  end
+  def def_singleton_method(m, erb_path=nil, **opts, &block) # DefMethod with singleton:true
+    DefMethod.new(self, erb_path||m, name:m, singleton:true, **opts, &block)
+  end
+  def undef_singleton_method(m) # child undefining m on the singleton class
+    UndefSingletonMethod.new(self,name:m)
+  end
+  def def_module_function(m, erb_path=nil, **opts, &block) # template defaults to m
+    DefModuleFunction.new(self, erb_path||m, name:m, **opts, &block)
+  end
+  def def_alias(from, to) # child emitting rb_define_alias(from, to)
+    DefAlias.new(self, from:from, to:to)
+  end
+  def def_const(m, v, **opts, &block) # constant m whose value is the C expression v
+    DefConst.new(self, name:m, value:v, **opts, &block)
+  end
+end
+
+class DefModule < ErbPP # node for a generated module; template 'module' by default
+  include DeclMethod
+  def initialize(parent, **opts, &block) # an explicit :erb_base option overrides 'module'
+    eb = opts[:erb_base] || 'module'
+    super(parent, erb_base:eb, **opts, &block)
+  end
+  def id_list # lazily created list of [name, var] pairs filled by def_id
+    @id_list ||= []
+  end
+  def def_id(name,var=nil) # var defaults to name with "?" -> "_p" and "!" -> "_bang"
+    var = name.gsub(/\?/,"_p").gsub(/\!/,"_bang") if var.nil?
+    id_list << [name,var]
+  end
+  def init_def # renders the init_erb template against this node and returns the text
+    load_erb(init_erb).result(binding)
+  end
+  def init_erb # :init_erb option, or "init_module"
+    @opts[:init_erb] || "init_module"
+  end
+  def method_code # rendered results of all children joined by newlines
+    @children.map{|c| c.result}.join("\n")
+  end
+  def _mod_var # the :module_var option
+    @opts[:module_var]
+  end
+end
+
+class DefClass < DefModule # node for a generated class; template 'class' by default
+  def initialize(parent, **opts, &block) # an explicit :erb_base option overrides 'class'
+    eb = opts[:erb_base] || 'class'
+    super(parent, erb_base:eb, **opts, &block)
+  end
+  def _mod_var # the :class_var option
+    @opts[:class_var]
+  end
+  def init_erb # :init_erb option, or "init_class"
+    @opts[:init_erb] || "init_class"
+  end
+  def super_class # :super_class option, or "rb_cObject"
+    @opts[:super_class] || "rb_cObject"
+  end
+  def free_func # :free_func option, or "gsl_<name>_free" built from get(:name)
+    @opts[:free_func] || "gsl_"+get(:name)+"_free"
+  end
+end
+
+class DefMethod < ErbPP # node for one generated method, rendered from an ERB template
+  include DeclMethod
+  # :erb_base is stored after super, i.e. after the block has been evaluated.
+  def initialize(parent, erb_base, **opts, &block)
+    super(parent, **opts, &block)
+    set erb_base: erb_base
+  end
+  # A one-character op as a quoted C char literal, otherwise "id_<c_name>".
+  def id_op
+    if op.size == 1
+      "'#{op}'"
+    else
+      "id_#{c_name}"
+    end
+  end
+  # The :name option with "?" -> "_p", "!" -> "_bang" and "=" -> "_set".
+  def c_name
+    @opts[:name].gsub(/\?/,"_p").gsub(/\!/,"_bang").gsub(/=/,"_set")
+  end
+  # Ruby-side method name: the :op option, or :name when :op is not set.
+  def op_map
+    @opts[:op] || @opts[:name]
+  end
+  # Stores n_arg when given; returns "<parent.name>[_s]_<c_name>" ("_s" if singleton).
+  def c_func(n_arg=nil)
+    set n_arg: n_arg if n_arg
+    s = (singleton) ? "_s" : ""
+    "#{@parent.name}#{s}_#{c_name}"
+  end
+  # "iter_" followed by c_func.
+  def c_iter
+    "iter_#{c_func}"
+  end
+  # Argument list shared by the rb_define_*method calls built in init_def.
+  def define_method_args
+    "#{_mod_var}, \"#{op_map}\", #{c_func}, #{n_arg}"
+  end
+  # rb_define_method / rb_define_singleton_method statement; nil when n_arg is :nodef.
+  def init_def
+    return if n_arg == :nodef
+    s = (singleton) ? "_singleton" : ""
+    "rb_define#{s}_method(#{define_method_args});"
+  end
+  # The :singleton option.
+  def singleton
+    @opts[:singleton]
+  end
+end
+
+class DefModuleFunction < DefMethod # method node registered with rb_define_module_function
+  def initialize(parent, erb_base, **opts, &block) # always marks the node singleton
+    super(parent, erb_base, **opts, &block)
+    set singleton: true
+  end
+  # Registration statement; nil when n_arg is :nodef.
+  def init_def
+    return if n_arg == :nodef
+    "rb_define_module_function(#{define_method_args});"
+  end
+end
+
+class DefAlias < ErbPP # node emitting an rb_define_alias call
+  def init_def # built from the :from and :to options and the resolved _mod_var
+    "rb_define_alias(#{_mod_var}, \"#{from}\", \"#{to}\");"
+  end
+end
+
+class DefAllocFunc < DefMethod # method node registered as the allocator
+  def init_def # rb_define_alloc_func statement naming this node's c_func
+    "rb_define_alloc_func(#{_mod_var}, #{c_func});"
+  end
+end
+
+class UndefAllocFunc < ErbPP # node emitting an rb_undef_alloc_func call
+  def init_def # statement for the resolved _mod_var
+    "rb_undef_alloc_func(#{_mod_var});"
+  end
+end
+
+class UndefMethod < ErbPP # node emitting an rb_undef_method call
+  def init_def # undefines the method given by the :name option on _mod_var
+    "rb_undef_method(#{_mod_var},\"#{name}\");"
+  end
+end
+
+class UndefSingletonMethod < ErbPP # like UndefMethod, but on the singleton class
+  def init_def # rb_undef_method applied to rb_singleton_class(_mod_var)
+    "rb_undef_method(rb_singleton_class(#{_mod_var}),\"#{name}\");"
+  end
+end
+
+class DefConst < ErbPP # node emitting an rb_define_const call
+  def init_def # C comment holding desc, then the statement defining :name as :value
+    "/*#{desc}*/
+    rb_define_const(#{_mod_var},\"#{name}\",#{value});"
+  end
+end
+
+class DefStruct < ErbPP # node for a Struct class created with rb_struct_define
+  def method_code # static VALUE declaration of the variable named by class_var
+    "static VALUE #{class_var};"
+  end
+  def init_def # C comment with the description, then the rb_struct_define statement
+    items = members.map{|s| "\"#{s}\""}.join(",") # each member as a quoted C string
+    "/*#{description}*/
+    #{class_var} = rb_struct_define(\"#{class_name}\",#{items},NULL);"
+  end
+end
+
+class DefInclueModule < ErbPP # NOTE(review): name looks like a typo of DefIncludeModule
+  def initialize(parent=nil, incl_class, incl_module, **opts, &block) # both stored as options
+    super(parent,incl_class:incl_class,incl_module:incl_module,**opts,&block)
+  end
+  def init_def # rb_include_module statement built from the two stored options
+    "rb_include_module(#{get(:incl_class)}, #{get(:incl_module)});"
+  end
+end
diff --git a/ext/numo/narray/gen/narray_def.rb b/ext/numo/narray/gen/narray_def.rb
new file mode 100644
index 0000000..1f5278c
--- /dev/null
+++ b/ext/numo/narray/gen/narray_def.rb
@@ -0,0 +1,252 @@
+require_relative './erbpp2'
+
+module NArrayMethod
+
+  def binary(meth, ope=nil)
+    ope = meth if ope.nil?
+    def_method(meth, "binary", op:ope)
+  end
+
+  def binary2(meth, ope=nil)
+    ope = meth if ope.nil?
+    def_method(meth, "binary2", op:ope)
+  end
+
+  def unary(meth, ope=nil)
+    def_method(meth, "unary", op:ope)
+  end
+
+  def pow
+    def_method("pow", "pow", op:"**")
+  end
+
+  def unary2(meth, dtype, result_class)
+    h = {dtype:dtype, result_class:result_class}
+    def_method(meth, "unary2", **h)
+  end
+
+  def set2(meth, dtype, result_class)
+    h = {dtype:dtype, result_class:result_class}
+    def_method(meth, "set2", h)
+  end
+
+  def cond_binary(meth,op=nil)
+    op = meth unless op
+    def_method(meth, "cond_binary", op:op)
+  end
+
+  def cond_unary(meth)
+    def_method(meth, "cond_unary")
+  end
+
+  def bit_count(meth)
+    def_method(meth, "bit_count")
+  end
+
+  def bit_reduce(meth, init_bit)
+    h = {init_bit:init_bit}
+    def_method(meth, "bit_reduce", **h)
+  end
+
+  def accum(meth, dtype, result_class)
+    h = {dtype:dtype, result_class:result_class}
+    def_method(meth, "accum", **h)
+  end
+
+  def accum_index(meth)
+    def_method(meth, "accum_index")
+  end
+
+  def cum(meth, cmacro)
+    def_method(meth, "cum", cmacro:cmacro)
+  end
+
+  def accum_binary(meth, ope=nil)
+    ope = meth if ope.nil?
+    def_method(meth, "accum_binary", op:ope)
+  end
+
+  def qsort(type_name, dtype, dcast, suffix="")
+    h = {type_name:type_name, dtype:dtype, dcast:dcast, suffix:suffix}
+    def_method("qsort", **h)
+  end
+end
+
+module NMathMethod
+
+  def math(meth, n=1, tmpl=nil, **h)
+    if tmpl.nil?
+      case n
+      when 1
+        tmpl = "unary_s"
+      when 2
+        tmpl = "binary_s"
+      when 3
+        tmpl = "ternary_s"
+      else
+        raise "invalid n=#{n}"
+      end
+    end
+    def_module_function(meth, tmpl, **h)
+  end
+end
+
+# ----------------------------------------------------------------------
+
+module NArrayType
+
+  def type_name
+    @opts[:type_name] ||= class_name.downcase
+  end
+  alias tp type_name
+
+  def type_var
+    @opts[:type_var] ||= "numo_c"+class_name
+  end
+
+  def math_var
+    @opts[:math_var] ||= "numo_m"+class_name+"Math"
+  end
+
+  def real_class_name(arg=nil)
+    if arg.nil?
+      @opts[:real_class_name] ||= class_name
+    else
+      @opts[:real_class_name] = arg
+    end
+  end
+
+  def real_ctype(arg=nil)
+    if arg.nil?
+      @opts[:real_ctype] ||= ctype
+    else
+      @opts[:real_ctype] = arg
+    end
+  end
+
+  def real_type_var
+    @opts[:real_type_var] ||= "numo_c"+real_class_name
+  end
+
+  def real_type_name
+    @opts[:real_type_name] ||= real_class_name.downcase
+  end
+
+  def class_alias(*args)
+    case a = @opts[:class_alias]
+    when Array
+    when nil
+      a = @opts[:class_alias] = []
+    else
+      a = @opts[:class_alias] = [a]
+    end
+    a.concat(args)
+  end
+
+  def upcast(c=nil,t=nil)
+    @opts[:upcast] ||= []
+    if c
+      if t
+        t = "numo_c#{t}"
+      else
+        t = "cT"
+      end
+      @opts[:upcast] << "rb_hash_aset(hCast, numo_c#{c}, #{t});"
+    else
+      @opts[:upcast]
+    end
+  end
+
+  def upcast_rb(c,t=nil)
+    @opts[:upcast] ||= []
+    if t
+      t = "numo_c#{t}"
+    else
+      t = "cT"
+    end
+    if c=="Integer"
+      @opts[:upcast] << "#ifdef RUBY_INTEGER_UNIFICATION"
+      @opts[:upcast] << "rb_hash_aset(hCast, rb_cInteger, #{t});"
+      @opts[:upcast] << "#else"
+      @opts[:upcast] << "rb_hash_aset(hCast, rb_cFixnum, #{t});"
+      @opts[:upcast] << "rb_hash_aset(hCast, rb_cBignum, #{t});"
+      @opts[:upcast] << "#endif"
+    else
+      @opts[:upcast] << "rb_hash_aset(hCast, rb_c#{c}, #{t});"
+    end
+  end
+end
+
+# ----------------------------------------------------------------------
+
+module StoreFrom
+
+  def store_from(cname, dtype=nil, macro=nil)
+    tmpl = (cname=="Bit") ? "store_bit" : "store_from"
+    h = { name:cname.downcase,
+          type_name:cname,
+          type_var:"numo_c"+cname,
+          dtype:dtype,
+          macro:macro }
+    Store.new(self, tmpl, **h)
+  end
+
+  def store_numeric
+    StoreNum.new(self, "store_numeric", name:"numeric")
+  end
+
+  def store_array
+    StoreArray.new(self, "store_array", name:"array")
+  end
+
+  def definitions
+    a = []
+    @children.each do |x|
+      if x.condition("")
+        if x.get(:type_name) == parent.class_name
+          a.unshift(x)
+        else
+          a.push(x)
+        end
+      end
+    end
+    a
+  end
+end
+
+# ----------------------------------------------------------------------
+
+class Store < DefMethod
+  def c_func(n=nil)
+    "#{parent.parent.name}_store_#{name}"
+  end
+
+  def condition(klass)
+    "#{klass}==#{type_var}"
+  end
+
+  def extract_data(ptr,pos,x)
+    case type_name
+    when "Bit"
+      "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_real(b);}"
+    when "RObject"
+      "#{x} = m_num_to_data(*(#{dtype}*)(#{ptr}+#{pos}))"
+    when /Complex/
+      "{#{dtype} *p = (#{dtype}*)(#{ptr}+#{pos}); #{x} = c_new(REAL(*p),IMAG(*p));}"
+    else
+      "#{x} = m_from_real(*(#{dtype}*)(#{ptr}+#{pos}))"
+    end
+  end
+end
+
+class StoreNum < Store # Store variant with its own condition
+  def condition(klass) # C test: integer class (IS_INTEGER_CLASS), rb_cFloat or rb_cComplex
+    "IS_INTEGER_CLASS(#{klass}) || #{klass}==rb_cFloat || #{klass}==rb_cComplex"
+  end
+end
+
+class StoreArray < Store # Store variant with its own condition
+  def condition(klass) # C test: klass is rb_cArray
+    "#{klass}==rb_cArray"
+  end
+end
diff --git a/ext/numo/narray/gen/spec.rb b/ext/numo/narray/gen/spec.rb
new file mode 100644
index 0000000..6b1e9d4
--- /dev/null
+++ b/ext/numo/narray/gen/spec.rb
@@ -0,0 +1,396 @@
+def_id "cast"
+def_id "eq"
+def_id "ne"
+def_id "pow"
+def_id "mulsum"
+if is_complex
+  def_id "real"
+  def_id "imag"
+else
+  def_id "divmod"
+end
+if is_float
+  def_id "nearly_eq"
+  def_id "copysign"
+end
+if is_int
+  def_id "<<","left_shift"
+  def_id ">>","right_shift"
+end
+if is_comparable && !is_object
+  def_id "gt"
+  def_id "ge"
+  def_id "lt"
+  def_id "le"
+end
+if is_object
+  def_id "bit_and"
+  def_id "bit_or"
+  def_id "bit_xor"
+  def_id "bit_not"
+  def_id "abs"
+  def_id "minus"
+  def_id "reciprocal"
+  def_id "square"
+  def_id "floor"
+  def_id "round"
+  def_id "ceil"
+  def_id "truncate"
+  def_id "nan?"
+  def_id "infinite?"
+  def_id "finite?"
+  def_id "==","eq"
+  def_id "!=","ne"
+  def_id ">" ,"gt"
+  def_id ">=","ge"
+  def_id "<" ,"lt"
+  def_id "<=","le"
+  def_id "<=>","ufo"
+end
+
+if is_int && !is_object
+  def_id "minlength" # for bincount
+end
+
+# Constants
+
+if is_bit
+  def_const "ELEMENT_BIT_SIZE",  "INT2FIX(1)"
+  def_const "ELEMENT_BYTE_SIZE", "rb_float_new(1.0/8)"
+  def_const "CONTIGUOUS_STRIDE", "INT2FIX(1)"
+else
+  def_const "ELEMENT_BIT_SIZE",  "INT2FIX(sizeof(dtype)*8)"
+  def_const "ELEMENT_BYTE_SIZE", "INT2FIX(sizeof(dtype))"
+  def_const "CONTIGUOUS_STRIDE", "INT2FIX(sizeof(dtype))"
+end
+
+if !is_object
+  if is_float
+    def_const "EPSILON", "M_EPSILON"
+  end
+  if is_float || is_int
+    def_const "MAX", "M_MAX"
+    def_const "MIN", "M_MIN"
+  end
+end
+
+# Un-define
+
+if is_object
+  undef_singleton_method "from_binary"
+  undef_method "to_binary"
+  undef_method "swap_byte"
+  undef_method "to_network"
+  undef_method "to_vacs"
+  undef_method "to_host"
+  undef_method "to_swapped"
+end
+
+# Allocation
+
+def_alloc_func "alloc_func"
+def_method "allocate"
+
+# Type conversion
+
+def_method "extract"
+def_method "new_dim0"
+
+def_method "store" do
+  extend StoreFrom
+  store_numeric
+  store_from "Bit"
+  if is_complex
+    store_from "DComplex","dcomplex","m_from_dcomplex"
+    store_from "SComplex","scomplex","m_from_scomplex"
+  end
+  store_from "DFloat","double",   "m_from_real"
+  store_from "SFloat","float",    "m_from_real"
+  store_from "Int64", "int64_t",  "m_from_real"
+  store_from "Int32", "int32_t",  "m_from_real"
+  store_from "Int16", "int16_t",  "m_from_real"
+  store_from "Int8",  "int8_t",   "m_from_real"
+  store_from "UInt64","u_int64_t","m_from_real"
+  store_from "UInt32","u_int32_t","m_from_real"
+  store_from "UInt16","u_int16_t","m_from_real"
+  store_from "UInt8", "u_int8_t", "m_from_real"
+  store_from "RObject", "VALUE",  "m_num_to_data"
+  store_array
+end
+
+def_method "extract_data"
+
+def_method "cast_array"
+def_singleton_method "cast"
+
+def_method "aref", op:"[]"
+def_method "aset", op:"[]="
+
+def_method "coerce_cast"
+def_method "to_a"
+def_method "fill"
+def_method "format"
+def_method "format_to_a"
+def_method "inspect"
+
+
+# Array manipulation
+
+def_method "each"
+unary "map" if !is_bit
+def_method "each_with_index"
+
+if is_bit
+  unary  "copy"
+  unary  "not", "~"
+  binary "and", "&"
+  binary "or" , "|"
+  binary "xor", "^"
+  binary "eq"
+  bit_count "count_true"
+  def_alias "count_1","count_true"
+  def_alias "count","count_true"
+  bit_count "count_false"
+  def_alias "count_0","count_false"
+  bit_reduce "all?", 1
+  bit_reduce "any?", 0
+  def_method "none?", "none_p"
+  def_method "where"
+  def_method "where2"
+  def_method "mask"
+else
+
+def_method "map_with_index"
+
+# Arithmetic
+
+unary2 "abs", "rtype", "cRT"
+
+binary "add", "+"
+binary "sub", "-"
+binary "mul", "*"
+binary "div", "/"
+
+if !is_complex
+  binary "mod", "%"
+  binary2 "divmod"
+end
+
+pow
+
+unary "minus", "-@"
+unary "reciprocal"
+unary "sign"
+unary "square"
+
+# Complex
+
+if is_complex
+  unary "conj"
+  unary "im"
+  unary2 "real", "rtype", "cRT"
+  unary2 "imag", "rtype", "cRT"
+  unary2 "arg",  "rtype", "cRT"
+  def_alias "angle","arg"
+  set2 "set_imag", "rtype", "cRT"
+  set2 "set_real", "rtype", "cRT"
+  def_alias "imag=","set_imag"
+  def_alias "real=","set_real"
+else
+  def_alias "conj", "view"
+  def_alias "im", "view"
+end
+
+def_alias "conjugate","conj"
+
+# base_cond
+
+cond_binary "eq"
+cond_binary "ne"
+
+# nearly_eq  : x=~y is true if |x-y| <= (|x|+|y|)*epsilon
+if is_float
+  cond_binary "nearly_eq"
+else
+  def_alias "nearly_eq", "eq"
+end
+def_alias "close_to", "nearly_eq"
+
+# Integer
+if is_int
+  binary "bit_and", "&"
+  binary "bit_or" , "|"
+  binary "bit_xor", "^"
+  unary  "bit_not", "~"
+  binary "left_shift", "<<"
+  binary "right_shift", ">>"
+  if !is_object
+    def_alias "floor", "view"
+    def_alias "round", "view"
+    def_alias "ceil",  "view"
+    def_alias "trunc", "view"
+    def_alias "rint",  "view"
+  end
+end
+
+if is_float
+  unary "floor"
+  unary "round"
+  unary "ceil"
+  unary "trunc"
+  if !is_object
+    unary "rint"
+    binary "copysign"
+    if !is_complex
+      cond_unary "signbit"
+      def_method "modf", "unary_ret2"
+    end
+  end
+end
+
+if is_comparable
+  cond_binary "gt"
+  cond_binary "ge"
+  cond_binary "lt"
+  cond_binary "le"
+  def_alias ">", "gt"
+  def_alias ">=","ge"
+  def_alias "<", "lt"
+  def_alias "<=","le"
+  def_method "clip"
+end
+
+# Float
+
+if is_float
+  cond_unary "isnan"
+  cond_unary "isinf"
+  cond_unary "isposinf"
+  cond_unary "isneginf"
+  cond_unary "isfinite"
+end
+
+accum "sum","dtype","cT"
+accum "prod","dtype","cT"
+if is_double_precision
+  accum "kahan_sum","dtype","cT"
+end
+
+if is_float
+  accum "mean","dtype","cT"
+  accum "stddev","rtype","cRT"
+  accum "var","rtype","cRT"
+  accum "rms","rtype","cRT"
+end
+
+if is_comparable
+  accum "min","dtype","cT"
+  accum "max","dtype","cT"
+  accum "ptp","dtype","cT"
+  accum_index "max_index"
+  accum_index "min_index"
+  def_method "minmax"
+end
+
+if is_int && !is_object
+  def_method "bincount"
+end
+
+cum "cumsum","add"
+cum "cumprod","mul"
+
+# dot
+accum_binary "mulsum"
+
+# rmsdev
+# prod
+
+# shuffle
+# histogram
+
+def_method "seq"
+if is_float
+  def_method "logseq"
+end
+def_method "eye"
+def_alias  "indgen", "seq"
+
+def_method "rand"
+if is_float && !is_object
+  def_method "rand_norm"
+end
+
+# y = a[0] + a[1]*x + a[2]*x^2 + a[3]*x^3 + ... + a[n]*x^n
+def_method "poly"
+
+if is_comparable && !is_object
+  if is_float
+    qsort type_name,"dtype","*(dtype*)","_prnan"
+    qsort type_name,"dtype","*(dtype*)","_ignan"
+  else
+    qsort type_name,"dtype","*(dtype*)"
+  end
+  def_method "sort"
+  if is_float
+    qsort type_name+"_index","dtype*","**(dtype**)","_prnan"
+    qsort type_name+"_index","dtype*","**(dtype**)","_ignan"
+  else
+    qsort type_name+"_index","dtype*","**(dtype**)"
+  end
+  def_method "sort_index"
+  def_method "median"
+end
+
+# Math
+# histogram
+
+if has_math
+fn = get(:full_class_name)
+cn = get(:class_name)
+nm = get(:name)
+is_c = is_complex
+
+def_module do
+  extend NMathMethod
+  set ns_var: "cT"
+  set class_name: cn
+  set name: "#{nm}_math"
+  set full_module_name: fn+"::NMath"
+  set module_name: "Math"
+  set module_var: "mTM"
+
+  math "sqrt"
+  math "cbrt"
+  math "log"
+  math "log2"
+  math "log10"
+  math "exp"
+  math "exp2"
+  math "exp10"
+  math "sin"
+  math "cos"
+  math "tan"
+  math "asin"
+  math "acos"
+  math "atan"
+  math "sinh"
+  math "cosh"
+  math "tanh"
+  math "asinh"
+  math "acosh"
+  math "atanh"
+  math "sinc"
+  if !is_c
+    math "atan2",2
+    math "hypot",2
+    math "erf"
+    math "erfc"
+    math "log1p"
+    math "expm1"
+    math "ldexp",2
+    math "frexp",1,"frexp"
+  end
+end
+end
+
+end # other than Bit
diff --git a/ext/numo/narray/gen/tmpl/accum.c b/ext/numo/narray/gen/tmpl/accum.c
new file mode 100644
index 0000000..feb2e54
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/accum.c
@@ -0,0 +1,48 @@
+<% (is_float ? ["","_nan"] : [""]).each do |j| %>
+static void
+<%=c_iter%><%=j%>(na_loop_t *const lp)
+{
+    size_t   n;
+    char    *p1, *p2;
+    ssize_t  s1;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR(lp, 0, p1, s1);
+    p2 = lp->args[1].ptr + lp->args[1].iter[0].pos;
+
+    *(<%=dtype%>*)p2 = f_<%=name%><%=j%>(n,p1,s1);
+}
+<% end %>
+
+/*
+  <%=name%> of self.
+<% if is_float %>
+  @overload <%=name%>(axis:nil, keepdims:false, nan:false)
+  @param [TrueClass] nan  If true, apply NaN-aware algorithm (avoid NaN for sum/mean etc, or, return NaN for min/max etc).
+<% else %>
+  @overload <%=name%>(axis:nil, keepdims:false)
+<% end %>
+  @param [Numeric,Array,Range] axis (keyword) Affected dimensions.
+  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
+  @return [Numo::<%=class_name%>] returns result of <%=name%>.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE v, reduce;
+    ndfunc_arg_in_t ain[2] = {{cT,0},{sym_reduce,0}};
+    ndfunc_arg_out_t aout[1] = {{<%=result_class%>,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP_NIP|NDF_FLAT_REDUCE, 2, 1, ain, aout };
+
+  <% if is_float %>
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, <%=c_iter%>_nan);
+  <% else %>
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+  <% end %>
+    v =  na_ndloop(&ndf, 2, self, reduce);
+  <% if result_class == "cT" %>
+    return <%=type_name%>_extract(v);
+  <% else %>
+    return rb_funcall(v,rb_intern("extract"),0);
+  <% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/accum_binary.c b/ext/numo/narray/gen/tmpl/accum_binary.c
new file mode 100644
index 0000000..ed2579f
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/accum_binary.c
@@ -0,0 +1,96 @@
+<% (is_float ? ["","_nan"] : [""]).each do |j| %>
+static void  // kernel folding (x,y) pairs into z via m_<name>[_nan]; "_nan" variant emitted when is_float
+<%=c_iter%><%=j%>(na_loop_t *const lp)
+{
+    size_t   i;
+    char    *p1, *p2, *p3;
+    ssize_t  s1, s2, s3;
+    dtype    x, y, z;
+    // args 0 and 1 are the two inputs; arg 2 (p3/s3) holds the accumulator/output
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    if (s3==0) {
+        // Reduce loop: output step is 0, so z is loaded once, updated per element, stored once
+        GET_DATA(p3,dtype,z);
+        for (; i--;) {
+            GET_DATA_STRIDE(p1,s1,dtype,x);
+            GET_DATA_STRIDE(p2,s2,dtype,y);
+            m_<%=name%><%=j%>(x,y,z);
+        }
+        SET_DATA(p3,dtype,z);
+    } else {  // output has its own step: load, update and store z for every element
+        for (; i--;) {
+            GET_DATA_STRIDE(p1,s1,dtype,x);
+            GET_DATA_STRIDE(p2,s2,dtype,y);
+            GET_DATA(p3,dtype,z);
+            m_<%=name%><%=j%>(x,y,z);
+            SET_DATA_STRIDE(p3,s3,dtype,z);
+        }
+    }
+}
+<% end %>
+
+static VALUE  // same-class path: self and other are both declared as cT inputs
+<%=c_func%>_self(int argc, VALUE *argv, VALUE self)
+{
+    VALUE v, reduce;
+    VALUE naryv[2];
+    ndfunc_arg_in_t ain[4] = {{cT,0},{cT,0},{sym_reduce,0},{sym_init,0}};  // self, other, reduce spec, initial value
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP_NIP, 4, 1, ain, aout };
+
+    if (argc < 1) {
+        rb_raise(rb_eArgError,"wrong number of arguments (%d for >=1)",argc);
+    }
+    // should fix below: [self.ndim,other.ndim].max or?
+    naryv[0] = self;
+    naryv[1] = argv[0];
+  <% if is_float %>
+    reduce = na_reduce_dimension(argc-1, argv+1, 2, naryv, &ndf, <%=c_iter%>_nan);  // argv[0] is other; the remaining args are passed on as options
+  <% else %>
+    reduce = na_reduce_dimension(argc-1, argv+1, 2, naryv, &ndf, 0);
+  <% end %>
+
+    v =  na_ndloop(&ndf, 4, self, argv[0], reduce, m_<%=name%>_init);  // m_<name>_init is passed as the sym_init input
+    return <%=type_name%>_extract(v);
+}
+
+/*
+  Binary <%=name%>.
+
+<% if is_float %>
+  @overload <%=op_map%>(other, axis:nil, keepdims:false, nan:false)
+<% else %>
+  @overload <%=op_map%>(other, axis:nil, keepdims:false)
+<% end %>
+  @param [Numo::NArray,Numeric] other
+  @param [Numeric,Array,Range] axis (keyword) Affected dimensions.
+  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
+<% if is_float %>
+  @param [TrueClass] nan (keyword) If true, apply NaN-aware algorithm (avoid NaN if exists).
+<% end %>
+  @return [Numo::NArray] <%=name%> of self and other.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    <% if !is_object %>
+    VALUE klass, v;
+    <% end %>
+    if (argc < 1) {
+        rb_raise(rb_eArgError,"wrong number of arguments (%d for >=1)",argc);
+    }
+    <% if is_object %>
+    return <%=c_func%>_self(argc, argv, self);  // object arrays skip the upcast check
+    <% else %>
+    klass = na_upcast(CLASS_OF(self),CLASS_OF(argv[0]));  // class chosen by na_upcast for the (self, other) pair
+    if (klass==cT) {
+        return <%=c_func%>_self(argc, argv, self);
+    } else {  // another class was chosen: cast self to it and re-dispatch the same method there
+        v = rb_funcall(klass, id_cast, 1, self);
+        return rb_funcall2(v, rb_intern("<%=name%>"), argc, argv);
+    }
+    <% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/accum_index.c b/ext/numo/narray/gen/tmpl/accum_index.c
new file mode 100644
index 0000000..2514cc7
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/accum_index.c
@@ -0,0 +1,71 @@
+<% (is_float ? ["","_nan"] : [""]).each do |j|
+   [64,32].each do |i| %>
+#define idx_t int<%=i%>_t
+static void  // index-returning kernel; generated for int64/int32 index types, plus "_nan" variants when is_float
+<%=c_iter%>_index<%=i%><%=j%>(na_loop_t *const lp)
+{
+    size_t   n, idx;
+    char    *d_ptr, *i_ptr, *o_ptr;
+    ssize_t  d_step, i_step;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR(lp, 0, d_ptr, d_step);
+    // idx: value returned by f_<name>[_nan] for this run of n data elements
+    idx = f_<%=name%><%=j%>(n,d_ptr,d_step);
+    // look up entry idx of the index array (arg 1) and write it to the output (arg 2)
+    INIT_PTR(lp, 1, i_ptr, i_step);
+    o_ptr = NDL_PTR(lp,2);
+    *(idx_t*)o_ptr = *(idx_t*)(i_ptr + i_step * idx);
+}
+#undef idx_t
+<% end;end %>
+
+/*
+  <%=name%>. Return an index of result.
+<% if is_float %>
+  @overload <%=name%>(axis:nil, nan:false)
+  @param [TrueClass] nan  If true, apply NaN-aware algorithm (return NaN position if exists).
+<% else %>
+  @overload <%=name%>(axis:nil)
+<% end %>
+  @param [Numeric,Array,Range] axis  Affected dimensions.
+  @return [Integer,Numo::Int] returns result index of <%=name%>.
+  @example
+      Numo::NArray[3,4,1,2].min_index => 2
+ */
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    narray_t *na;
+    VALUE idx, reduce;
+    ndfunc_arg_in_t ain[3] = {{Qnil,0},{Qnil,0},{sym_reduce,0}};  // self, index array, reduce spec
+    ndfunc_arg_out_t aout[1] = {{0,0,0}};  // output type is filled in below (Int64 or Int32)
+    ndfunc_t ndf = {0, STRIDE_LOOP_NIP|NDF_FLAT_REDUCE|NDF_EXTRACT, 3,1, ain,aout};
+
+    GetNArray(self,na);
+    if (na->ndim==0) {
+        return INT2FIX(0);
+    }
+    if (na->size > ((~(u_int32_t)0)>>1)) {  // indices are stored as signed int32_t, so the limit is 2^31-1, not 2^32-1
+        aout[0].type = numo_cInt64;
+        idx = nary_new(numo_cInt64, na->ndim, na->shape);
+        ndf.func = <%=c_iter%>_index64;
+      <% if is_float %>
+        reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, <%=c_iter%>_index64_nan);
+      <% else %>
+        reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+      <% end %>
+    } else {
+        aout[0].type = numo_cInt32;
+        idx = nary_new(numo_cInt32, na->ndim, na->shape);
+        ndf.func = <%=c_iter%>_index32;
+      <% if is_float %>
+        reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, <%=c_iter%>_index32_nan);
+      <% else %>
+        reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+      <% end %>
+    }
+    rb_funcall(idx, rb_intern("seq"), 0);  // fill idx via its #seq method; the kernel looks results up in it
+
+    return na_ndloop(&ndf, 3, self, idx, reduce);
+}
diff --git a/ext/numo/narray/gen/tmpl/alloc_func.c b/ext/numo/narray/gen/tmpl/alloc_func.c
new file mode 100644
index 0000000..e3ddda4
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/alloc_func.c
@@ -0,0 +1,107 @@
+static size_t  // size callback registered in the rb_data_type_t: bytes attributed to this narray
+<%=type_name%>_memsize(const void* ptr)
+{
+    size_t size = sizeof(narray_data_t);
+    const narray_data_t *na = (const narray_data_t*)ptr;
+
+    assert(na->base.type == NARRAY_DATA_T);
+
+    if (na->ptr != NULL) {
+  <% if is_bit %>
+        size += ((na->base.size-1)/8/sizeof(BIT_DIGIT)+1)*sizeof(BIT_DIGIT);  // bit count rounded up to whole BIT_DIGIT words
+  <% else %>
+        size += na->base.size * sizeof(dtype);
+  <% end %>
+    }
+    if (na->base.size > 0) {
+        if (na->base.shape != NULL && na->base.shape != &(na->base.size)) {  // shape may point at the size field itself; count it only when it is separate
+            size += sizeof(size_t) * na->base.ndim;
+        }
+    }
+    return size;
+}
+
+static void  // free callback registered in the rb_data_type_t: releases data buffer, shape and the struct
+<%=type_name%>_free(void* ptr)
+{
+    narray_data_t *na = (narray_data_t*)ptr;
+
+    assert(na->base.type == NARRAY_DATA_T);
+
+    if (na->ptr != NULL) {
+        xfree(na->ptr);
+        na->ptr = NULL;
+    }
+    if (na->base.size > 0) {
+        if (na->base.shape != NULL && na->base.shape != &(na->base.size)) {  // do not free shape when it points at the size field inside the struct
+            xfree(na->base.shape);
+            na->base.shape = NULL;
+        }
+    }
+    xfree(na);
+}
+
+static narray_type_info_t <%=type_name%>_info = {  // element layout of this type; referenced from <type_name>_data_type
+  <% if is_bit %>
+    1,             // element_bits
+    0,             // element_bytes
+    1,             // element_stride (in bits)
+  <% else %>
+    0,             // element_bits
+    sizeof(dtype), // element_bytes
+    sizeof(dtype), // element_stride (in bytes)
+  <% end %>
+};
+
+<% if is_object %>
+static void  // mark callback, generated only when is_object: elements are Ruby VALUEs and must be marked
+<%=type_name%>_gc_mark(void *ptr)
+{
+    size_t n, i;
+    VALUE *a;
+    narray_data_t *na = ptr;
+
+    if (na->ptr) {
+        a = (VALUE*)(na->ptr);
+        n = na->base.size;
+        for (i=0; i<n; i++) {
+            rb_gc_mark(a[i]);
+        }
+    }
+}
+
+const rb_data_type_t <%=type_name%>_data_type = {
+    "<%=full_class_name%>",
+    {<%=type_name%>_gc_mark, <%=type_name%>_free, <%=type_name%>_memsize,},  // mark, free, size callbacks
+    &na_data_type,
+    &<%=type_name%>_info,
+    0, // flags
+};
+
+<% else %>
+
+const rb_data_type_t <%=type_name%>_data_type = {
+    "<%=full_class_name%>",
+    {0, <%=type_name%>_free, <%=type_name%>_memsize,},  // no mark callback for non-object types
+    &na_data_type,
+    &<%=type_name%>_info,
+    0, // flags
+};
+
+<% end %>
+
+VALUE  // allocator: wraps a fresh narray_data_t with no dimensions and no data buffer
+<%=c_func(0)%>(VALUE klass)
+{
+    narray_data_t *na = ALLOC(narray_data_t);
+
+    na->base.ndim = 0;
+    na->base.type = NARRAY_DATA_T;
+    na->base.flag[0] = NA_FL0_INIT;
+    na->base.flag[1] = NA_FL1_INIT;
+    na->base.size = 0;
+    na->base.shape = NULL;
+    na->base.reduce = INT2FIX(0);
+    na->ptr = NULL;  // the element buffer is not allocated here
+    return TypedData_Wrap_Struct(klass, &<%=type_name%>_data_type, (void*)na);
+}
diff --git a/ext/numo/narray/gen/tmpl/allocate.c b/ext/numo/narray/gen/tmpl/allocate.c
new file mode 100644
index 0000000..7dc13a0
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/allocate.c
@@ -0,0 +1,35 @@
+static VALUE  // ensures the element buffer exists; returns self
+<%=c_func(0)%>(VALUE self)
+{
+    narray_t *na;
+    char *ptr;
+
+    GetNArray(self,na);
+
+    switch(NA_TYPE(na)) {
+    case NARRAY_DATA_T:
+        ptr = NA_DATA_PTR(na);
+        if (na->size > 0 && ptr == NULL) {  // allocate only once, and only for non-empty arrays
+            ptr = xmalloc2(na->size, sizeof(dtype));  // xmalloc2 checks count*size for overflow
+            <% if is_object %>
+            {   size_t i;
+                VALUE *a = (VALUE*)ptr;
+                for (i=na->size; i--;) {
+                    *a++ = Qnil;  // object arrays start out as nil so every slot holds a valid VALUE
+                }
+            }
+            <% end %>
+            NA_DATA_PTR(na) = ptr;
+        }
+        break;
+    case NARRAY_VIEW_T:
+        rb_funcall(NA_VIEW_DATA(na), rb_intern("allocate"), 0);  // a view delegates to the array it refers to
+        break;
+    case NARRAY_FILEMAP_T:
+        //ptr = ((narray_filemap_t*)na)->ptr;
+        // to be implemented; until then this case falls through to the error below
+    default:
+        rb_bug("invalid narray type : %d",NA_TYPE(na));
+    }
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/aref.c b/ext/numo/narray/gen/tmpl/aref.c
new file mode 100644
index 0000000..a1b4c02
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/aref.c
@@ -0,0 +1,53 @@
+/*
+  Array element reference or slice view.
+  @overload [](dim0,...,dimL)
+  @param [Numeric,Range,etc] dim0,...,dimL  Multi-dimensional Index.
+  @return [Numeric,Numo::<%=class_name%>] Element object or NArray view.
+
+  --- Returns the element at +dim0+, +dim1+, ... if all of them are Numeric
+  indices for each dimension, or returns a NArray View as a sliced subarray if
+  +dim0+, +dim1+, ... includes other than Numeric index, e.g., Range
+  or Array or true.
+
+  @example
+      a = Numo::DFloat.new(4,5).seq
+      => Numo::DFloat#shape=[4,5]
+      [[0, 1, 2, 3, 4],
+       [5, 6, 7, 8, 9],
+       [10, 11, 12, 13, 14],
+       [15, 16, 17, 18, 19]]
+
+      a[1,1]
+      => 6.0
+
+      a[1..3,1]
+      => Numo::DFloat#shape=[3]
+      [6, 11, 16]
+
+      a[1,[1,3,4]]
+      => Numo::DFloat#shape=[3]
+      [6, 8, 9]
+
+      a[true,2].fill(99)
+      a
+      => Numo::DFloat#shape=[4,5]
+      [[0, 1, 99, 3, 4],
+       [5, 6, 99, 8, 9],
+       [10, 11, 99, 13, 14],
+       [15, 16, 99, 18, 19]]
+ */
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    int nd;
+    size_t pos;
+    char *ptr;
+
+    nd = na_get_result_dimension(self, argc, argv, sizeof(dtype), &pos);  // also sets pos, used below as an offset from the data pointer
+    if (nd) {
+        return na_aref_main(argc, argv, self, 0, nd);  // nd != 0: build the result via na_aref_main
+    } else {  // nd == 0: a single element, extracted directly at offset pos
+        ptr = na_get_pointer_for_read(self) + pos;
+        return m_extract(ptr);
+    }
+}
diff --git a/ext/numo/narray/gen/tmpl/aset.c b/ext/numo/narray/gen/tmpl/aset.c
new file mode 100644
index 0000000..8e206c6
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/aset.c
@@ -0,0 +1,65 @@
+/*
+  Array element(s) set.
+  @overload []=(dim0,..,dimL,val)
+  @param [Numeric,Range,etc] dim0,..,dimL  Multi-dimensional Index.
+  @param [Numeric,Numo::NArray,etc] val  Value(s) to be set to self.
+  @return [Numeric] returns val (last argument).
+
+  --- Replace element(s) at +dim0+, +dim1+, ... (index/range/array/true
+  for each dimension). Broadcasting mechanism is applied.
+
+  @example
+      a = Numo::DFloat.new(3,4).seq
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [4, 5, 6, 7],
+       [8, 9, 10, 11]]
+
+      a[1,2]=99
+      a
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [4, 5, 99, 7],
+       [8, 9, 10, 11]]
+
+      a[1,[0,2]] = [101,102]
+      a
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [101, 5, 102, 7],
+       [8, 9, 10, 11]]
+
+      a[1,true]=99
+      a
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [99, 99, 99, 99],
+       [8, 9, 10, 11]]
+  @raise [ArgumentError] if called without any argument (no value to store).
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    int nd;
+    size_t pos;
+    char *ptr;
+    VALUE a;
+    dtype x;
+    // The last argument is the value to store; with no arguments argv[argc-1] would be read out of bounds.
+    if (argc < 1) {rb_raise(rb_eArgError,"wrong number of arguments (%d for >=1)",argc);}
+    argc--;  // from here on argc counts index arguments only and argv[argc] is the value
+    if (argc==0) {
+        <%=c_func.sub(/_aset/,"_store")%>(self, argv[argc]);  // no index given: store into the whole array
+    } else {
+        nd = na_get_result_dimension(self, argc, argv, sizeof(dtype), &pos);
+        if (nd) {
+            a = na_aref_main(argc, argv, self, 0, nd);  // take the selection, then store into it
+            <%=c_func.sub(/_aset/,"_store")%>(a, argv[argc]);
+        } else {  // nd == 0: single element, written directly at offset pos
+            x = <%=type_name%>_extract_data(argv[argc]);
+            ptr = na_get_pointer_for_read_write(self) + pos;
+            *(dtype*)ptr = x;
+        }
+    }
+    return argv[argc];
+}
diff --git a/ext/numo/narray/gen/tmpl/binary.c b/ext/numo/narray/gen/tmpl/binary.c
new file mode 100644
index 0000000..135f7b4
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/binary.c
@@ -0,0 +1,57 @@
+static void  // element-wise kernel: out = m_<name>(x, y)
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t   i, n;
+    char    *p1, *p2, *p3;
+    ssize_t  s1, s2, s3;
+    dtype    x, y;
+    INIT_COUNTER(lp, n);
+    INIT_PTR(lp, 0, p1, s1);  // args 0,1: inputs; arg 2: output
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    for (i=n; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,y);
+<% if is_int and %w[div mod divmod].include? name %>
+        if (y==0) {  // integer division by zero: record the error class on the loop struct and stop
+            lp->err_type = rb_eZeroDivError;  // presumably raised by the caller of this kernel -- TODO confirm
+            return;
+        }
+<% end %>
+        x = m_<%=name%>(x,y);
+        SET_DATA_STRIDE(p3,s3,dtype,x);
+    }
+}
+
+static VALUE  // same-class path: both operands are declared as cT and the result is cT
+<%=c_func%>_self(VALUE self, VALUE other)
+{
+    ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 2, 1, ain, aout };  // 2 inputs, 1 output
+
+    return na_ndloop(&ndf, 2, self, other);
+}
+
+/*
+  Binary <%=name%>.
+  @overload <%=op_map%> other
+  @param [Numo::NArray,Numeric] other
+  @return [Numo::NArray] self <%=op_map%> other
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE other)
+{
+    <% if is_object %>
+    return <%=c_func%>_self(self, other);  // object arrays skip the upcast check
+    <% else %>
+    VALUE klass, v;
+    klass = na_upcast(CLASS_OF(self),CLASS_OF(other));  // class chosen by na_upcast for the operand pair
+    if (klass==cT) {
+        return <%=c_func%>_self(self, other);
+    } else {  // another class was chosen: cast self to it and apply the same operator there
+        v = rb_funcall(klass, id_cast, 1, self);
+        return rb_funcall(v, <%=id_op%>, 1, other);
+    }
+    <% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/binary2.c b/ext/numo/narray/gen/tmpl/binary2.c
new file mode 100644
index 0000000..e844afd
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/binary2.c
@@ -0,0 +1,59 @@
+static void  // element-wise kernel with two outputs: m_<name>(x, y, a, b)
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t   i, n;
+    char    *p1, *p2, *p3, *p4;
+    ssize_t  s1, s2, s3, s4;
+    dtype    x, y, a, b;
+    INIT_COUNTER(lp, n);
+    INIT_PTR(lp, 0, p1, s1);  // args 0,1: inputs; args 2,3: outputs
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    INIT_PTR(lp, 3, p4, s4);
+    for (i=n; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,y);
+<% if is_int and %w[divmod].include? name %>
+        if (y==0) {  // integer division by zero: record the error class on the loop struct and stop
+            lp->err_type = rb_eZeroDivError;  // presumably raised by the caller of this kernel -- TODO confirm
+            return;
+        }
+<% end %>
+        m_<%=name%>(x,y,a,b);
+        SET_DATA_STRIDE(p3,s3,dtype,a);
+        SET_DATA_STRIDE(p4,s4,dtype,b);
+    }
+}
+
+static VALUE  // same-class path: both operands are declared as cT, two cT results
+<%=c_func%>_self(VALUE self, VALUE other)
+{
+    ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
+    ndfunc_arg_out_t aout[2] = {{cT,0},{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 2, 2, ain, aout };  // 2 inputs, 2 outputs
+
+    return na_ndloop(&ndf, 2, self, other);
+}
+
+/*
+  Binary <%=name%>.
+  @overload <%=op_map%> other
+  @param [Numo::NArray,Numeric] other
+  @return [Numo::NArray] <%=name%> of self and other.
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE other)
+{
+    <% if is_object %>
+    return <%=c_func%>_self(self, other);  // object arrays skip the upcast check
+    <% else %>
+    VALUE klass, v;
+    klass = na_upcast(CLASS_OF(self),CLASS_OF(other));  // class chosen by na_upcast for the operand pair
+    if (klass==cT) {
+        return <%=c_func%>_self(self, other);
+    } else {  // another class was chosen: cast self to it and apply the same operator there
+        v = rb_funcall(klass, id_cast, 1, self);
+        return rb_funcall(v, <%=id_op%>, 1, other);
+    }
+    <% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/binary_s.c b/ext/numo/narray/gen/tmpl/binary_s.c
new file mode 100644
index 0000000..347ee8d
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/binary_s.c
@@ -0,0 +1,34 @@
+static void  // element-wise kernel for the module-function form: out = m_<name>(x, y)
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char    *p1, *p2, *p3;
+    ssize_t s1, s2, s3;
+    dtype    x, y;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);  // args 0,1: inputs; arg 2: output
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,y);
+        x = m_<%=name%>(x,y);
+        SET_DATA_STRIDE(p3,s3,dtype,x);
+    }
+}
+
+/*
+  Calculate <%=name%>(a1,a2).
+  @overload <%=name%>(a1,a2)
+  @param [Numo::NArray,Numeric] a1  first value
+  @param [Numo::NArray,Numeric] a2  second value
+  @return [Numo::<%=class_name%>] <%=name%>(a1,a2).
+*/
+static VALUE
+<%=c_func(2)%>(VALUE mod, VALUE a1, VALUE a2)
+{
+    ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};  // both arguments are declared as cT inputs
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 2, 1, ain, aout };
+    return na_ndloop(&ndf, 2, a1, a2);  // the receiver (mod) is not used
+}
diff --git a/ext/numo/narray/gen/tmpl/bincount.c b/ext/numo/narray/gen/tmpl/bincount.c
new file mode 100644
index 0000000..9cbf164
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/bincount.c
@@ -0,0 +1,180 @@
+// ------- Integer count without weights -------
+<%
+[32,64].each do |bits|
+   cnt_cT = "numo_cUInt#{bits}"
+   cnt_type = "u_int#{bits}_t"
+%>
+static void  // unweighted count kernel; generated for 32- and 64-bit unsigned counters
+<%=c_iter%>_<%=bits%>(na_loop_t *const lp)
+{
+    size_t   i, x, n;
+    char    *p1, *p2;
+    ssize_t  s1, s2;
+    size_t  *idx1;
+
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR(lp, 1, p2, s2);
+    i = lp->args[0].shape[0];  // number of input values
+    n = lp->args[1].shape[0];  // number of output bins
+
+    // initialize
+    for (x=0; x < n; x++) {
+        *(<%=cnt_type%>*)(p2 + s2*x) = 0;
+    }
+    // each value x is used directly as a bin number; no check against n is made here
+    if (idx1) {
+        for (; i--;) {
+            GET_DATA_INDEX(p1,idx1,dtype,x);
+            (*(<%=cnt_type%>*)(p2 + s2*x))++;
+        }
+    } else {
+        for (; i--;) {
+            GET_DATA_STRIDE(p1,s1,dtype,x);
+            (*(<%=cnt_type%>*)(p2 + s2*x))++;
+        }
+    }
+}
+
+static VALUE  // runs the unweighted count kernel with an output of `length` bins
+<%=c_func%>_<%=bits%>(VALUE self, size_t length)
+{
+    size_t shape_out[1] = {length};
+    ndfunc_arg_in_t ain[1] = {{cT,1}};
+    ndfunc_arg_out_t aout[1] = {{<%=cnt_cT%>,1,shape_out}};  // output shape is [length]
+    ndfunc_t ndf = {<%=c_iter%>_<%=bits%>, NO_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP,
+                    1, 1, ain, aout};
+
+    return na_ndloop(&ndf, 1, self);
+}
+<% end %>
+// ------- end of Integer count without weights -------
+
+// ------- Float count with weights -------
+<%
+[["SF","float"],
+ ["DF","double"]].each do |fn,cnt_type|
+  cnt_cT = "numo_c#{fn}loat"
+  fn = fn.downcase
+%>
+static void  // weighted count kernel; generated for float ("sf") and double ("df") weights
+<%=c_iter%>_<%=fn%>(na_loop_t *const lp)
+{
+    <%=cnt_type%> w;
+    size_t   i, x, n, m;
+    char    *p1, *p2, *p3;
+    ssize_t  s1, s2, s3;
+
+    INIT_PTR(lp, 0, p1, s1);
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    i = lp->args[0].shape[0];  // number of input values
+    m = lp->args[1].shape[0];  // number of weights
+    n = lp->args[2].shape[0];  // number of output bins
+
+    if (i != m) {
+        rb_raise(nary_eShapeError,
+                 "size mismatch along last axis between self and weight");
+    }
+
+    // initialize
+    for (x=0; x < n; x++) {
+        *(<%=cnt_type%>*)(p3 + s3*x) = 0;
+    }
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,<%=cnt_type%>,w);
+        (*(<%=cnt_type%>*)(p3 + s3*x)) += w;  // add the weight to bin x; no check against n is made here
+    }
+}
+
+static VALUE  // runs the weighted count kernel with an output of `length` bins
+<%=c_func%>_<%=fn%>(VALUE self, VALUE weight, size_t length)
+{
+    size_t shape_out[1] = {length};
+    ndfunc_arg_in_t ain[2] = {{cT,1},{<%=cnt_cT%>,1}};  // self, then the weight declared with the float class
+    ndfunc_arg_out_t aout[1] = {{<%=cnt_cT%>,1,shape_out}};  // output shape is [length]
+    ndfunc_t ndf = {<%=c_iter%>_<%=fn%>, NO_LOOP|NDF_STRIDE_LOOP,
+                    2, 1, ain, aout};
+
+    return na_ndloop(&ndf, 2, self, weight);
+}
+<% end %>
+// ------- end of Float count with weights -------
+
+/*
+  Count the number of occurrences of each non-negative integer value.
+  Only Integer-types have this method.
+
+  @overload <%=name%>([weight], minlength:nil)
+  @param [SFloat or DFloat or Array] weight (optional) Array of
+    float values. Its size along last axis should be same as that of self.
+  @param [Integer] minlength (keyword, optional) Minimum size along
+    last axis for the output array.
+  @return [UInt32 or UInt64 or SFloat or DFloat]
+    Returns Float NArray if weight array is supplied,
+    otherwise returns UInt32 or UInt64 depending on the size along last axis.
+  @example
+    Numo::Int32[0..4].bincount
+    => Numo::UInt32#shape=[5]
+       [1, 1, 1, 1, 1]
+
+    Numo::Int32[0, 1, 1, 3, 2, 1, 7].bincount
+    => Numo::UInt32#shape=[8]
+       [1, 3, 1, 1, 0, 0, 0, 1]
+
+    x = Numo::Int32[0, 1, 1, 3, 2, 1, 7, 23]
+    x.bincount.size == x.max+1
+    => true
+
+    w = Numo::DFloat[0.3, 0.5, 0.2, 0.7, 1.0, -0.6]
+    x = Numo::Int32[0, 1, 1, 2, 2, 2]
+    x.bincount(w)
+    => Numo::DFloat#shape=[3]
+       [0.3, 0.7, 1.1]
+  @raise [ArgumentError] if self contains a negative value.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE weight=Qnil, kw=Qnil;
+    VALUE opts[1] = {Qundef};
+    VALUE v, wclass;
+    ID table[1] = {id_minlength};
+    size_t length, minlength;
+
+    rb_scan_args(argc, argv, "01:", &weight, &kw);  // one optional positional (weight) plus keywords
+    rb_get_kwargs(kw, table, 0, 1, opts);           // minlength is optional; opts[0] stays Qundef if absent
+
+  <% if is_unsigned %>
+    v = <%=type_name%>_max(0,0,self);
+  <% else %>
+    v = <%=type_name%>_minmax(0,0,self);
+    if (m_num_to_data(RARRAY_AREF(v,0)) < 0) {
+        rb_raise(rb_eArgError,"array items must be non-negative");
+    }
+    v = RARRAY_AREF(v,1);
+  <% end %>
+    length = NUM2SIZET(v) + 1;  // bins 0..max
+
+    if (opts[0] != Qundef) {
+        minlength = NUM2SIZET(opts[0]);
+        if (minlength > length) {
+            length = minlength;
+        }
+    }
+
+    if (NIL_P(weight)) {
+        if (length > 4294967295ul) {
+            return <%=c_func%>_64(self, length);
+        } else {
+            return <%=c_func%>_32(self, length);
+        }
+    } else {
+        wclass = CLASS_OF(weight);
+        if (wclass == numo_cSFloat) {
+            return <%=c_func%>_sf(self, weight, length);
+        } else {  // any other weight class is handled by the DFloat kernel
+            return <%=c_func%>_df(self, weight, length);
+        }
+    }
+}
diff --git a/ext/numo/narray/gen/tmpl/cast.c b/ext/numo/narray/gen/tmpl/cast.c
new file mode 100644
index 0000000..4e2c7c8
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/cast.c
@@ -0,0 +1,44 @@
+<% children.each do |c|%>
+<%= c.result %>
+
+<% end %>
+/*
+  Cast object to Numo::<%=class_name%>.
+  @overload [](elements)
+  @overload <%=name%>(array)
+  @param [Numeric,Array] elements
+  @param [Array] array
+  @return [Numo::<%=class_name%>]
+*/
+static VALUE
+<%=c_func(1)%>(VALUE type, VALUE obj)
+{
+    VALUE v;
+    narray_t *na;
+    dtype x;
+
+    if (CLASS_OF(obj)==cT) {  // already exactly this class: returned as is, without copying
+        return obj;
+    }
+    if (RTEST(rb_obj_is_kind_of(obj,rb_cNumeric))) {  // Numeric: wrap in a new array via <type_name>_new_dim0
+        x = m_num_to_data(obj);
+        return <%=type_name%>_new_dim0(x);
+    }
+    if (RTEST(rb_obj_is_kind_of(obj,rb_cArray))) {  // Ruby Array: handled by the cast_array template function
+        return <%=find_tmpl("cast_array").c_func%>(obj);
+    }
+    if (IsNArray(obj)) {  // other NArray: new array of this class with the same shape, filled by store
+        GetNArray(obj,na);
+        v = nary_new(cT, NA_NDIM(na), NA_SHAPE(na));
+        if (NA_SIZE(na) > 0) {
+            <%=find_tmpl("store").c_func%>(v,obj);
+        }
+        return v;
+    }
+    <% if is_object %>
+    return robject_new_dim0(obj);  // object arrays accept any other object
+    <% else %>
+    rb_raise(nary_eCastError,"cannot cast to %s",rb_class2name(type));
+    return Qnil;
+    <% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/cast_array.c b/ext/numo/narray/gen/tmpl/cast_array.c
new file mode 100644
index 0000000..4c22030
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/cast_array.c
@@ -0,0 +1,13 @@
+static VALUE  // builds an array of this class from a Ruby Array
+<%=c_func(:nodef)%>(VALUE rary)
+{
+    VALUE nary;
+    narray_t *na;
+
+    nary = na_s_new_like(cT, rary);  // new cT array created from rary by na_s_new_like (defined elsewhere)
+    GetNArray(nary,na);
+    if (na->size > 0) {  // nothing to store for an empty result
+        <%=find_tmpl("store").find("array").c_func%>(nary,rary);
+    }
+    return nary;
+}
diff --git a/ext/numo/narray/gen/tmpl/class.c b/ext/numo/narray/gen/tmpl/class.c
new file mode 100644
index 0000000..ab1b75b
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/class.c
@@ -0,0 +1,9 @@
+/*
+  class definition: <%= full_class_name %>
+*/
+
+VALUE <%=class_var%>;  // non-static global for this class
+
+static VALUE <%= find('store').c_func %>(VALUE,VALUE);  // forward declaration of the store function, ahead of the method code
+
+<%= method_code %>
diff --git a/ext/numo/narray/gen/tmpl/clip.c b/ext/numo/narray/gen/tmpl/clip.c
new file mode 100644
index 0000000..17e18db
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/clip.c
@@ -0,0 +1,118 @@
+static void  // kernel for clip with both bounds given
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2, *p3, *p4;
+    ssize_t s1, s2, s3, s4;
+    dtype   x, min, max;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);  // arg 0: data, arg 1: min, arg 2: max, arg 3: output
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    INIT_PTR(lp, 3, p4, s4);
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,min);
+        GET_DATA_STRIDE(p3,s3,dtype,max);
+        if (m_gt(min,max)) {rb_raise(nary_eOperationError,"min is greater than max");}  // checked per element, since min/max are read per element
+        if (m_lt(x,min)) {x=min;}
+        if (m_gt(x,max)) {x=max;}
+        SET_DATA_STRIDE(p4,s4,dtype,x);
+    }
+}
+
+static void  // kernel for clip with only the lower bound given
+<%=c_iter%>_min(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2, *p3;
+    ssize_t s1, s2, s3;
+    dtype   x, min;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);  // arg 0: data, arg 1: min, arg 2: output
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,min);
+        if (m_lt(x,min)) {x=min;}
+        SET_DATA_STRIDE(p3,s3,dtype,x);
+    }
+}
+
+static void  // kernel for clip with only the upper bound given
+<%=c_iter%>_max(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2, *p3;
+    ssize_t s1, s2, s3;
+    dtype   x, max;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);  // arg 0: data, arg 1: max, arg 2: output
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,max);
+        if (m_gt(x,max)) {x=max;}
+        SET_DATA_STRIDE(p3,s3,dtype,x);
+    }
+}
+
+/*
+  Clip array elements by [min,max].
+  If either of min or max is nil, one side is clipped.
+  @overload <%=name%>(min,max)
+  @param [Numo::NArray,Numeric] min
+  @param [Numo::NArray,Numeric] max
+  @return [Numo::NArray] result of clip.
+
+  @example
+      a = Numo::Int32.new(10).seq
+      p a.clip(1,8)
+      # Numo::Int32#shape=[10]
+      # [1, 1, 2, 3, 4, 5, 6, 7, 8, 8]
+
+      p a
+      # Numo::Int32#shape=[10]
+      # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+      p a.inplace.clip(3,6)
+      # Numo::Int32(view)#shape=[10]
+      # [3, 3, 3, 3, 4, 5, 6, 6, 6, 6]
+
+      p a
+      # Numo::Int32#shape=[10]
+      # [3, 3, 3, 3, 4, 5, 6, 6, 6, 6]
+
+      p a = Numo::Int32.new(10).seq
+      # Numo::Int32#shape=[10]
+      # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+      p a.clip([3,4,1,1,1,4,4,4,4,4], 8)
+      # Numo::Int32#shape=[10]
+      # [3, 4, 2, 3, 4, 5, 6, 7, 8, 8]
+*/
+static VALUE
+<%=c_func(2)%>(VALUE self, VALUE min, VALUE max)
+{
+    ndfunc_arg_in_t ain[3] = {{Qnil,0},{cT,0},{cT,0}};  // shared by all three ndfuncs; the one-sided ones declare only 2 inputs
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf_min = { <%=c_iter%>_min, STRIDE_LOOP, 2, 1, ain, aout };
+    ndfunc_t ndf_max = { <%=c_iter%>_max, STRIDE_LOOP, 2, 1, ain, aout };
+    ndfunc_t ndf_both = { <%=c_iter%>, STRIDE_LOOP, 3, 1, ain, aout };
+
+    if (RTEST(min)) {  // RTEST: a nil or false bound counts as not given
+        if (RTEST(max)) {
+            return na_ndloop(&ndf_both, 3, self, min, max);
+        } else {
+            return na_ndloop(&ndf_min, 2, self, min);
+        }
+    } else {
+        if (RTEST(max)) {
+            return na_ndloop(&ndf_max, 2, self, max);
+        }
+    }
+    rb_raise(rb_eArgError,"min and max are not given");  // reached only when neither bound was given
+    return Qnil;
+}
diff --git a/ext/numo/narray/gen/tmpl/coerce_cast.c b/ext/numo/narray/gen/tmpl/coerce_cast.c
new file mode 100644
index 0000000..e799bad
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/coerce_cast.c
@@ -0,0 +1,10 @@
+/*
+  Placeholder for casting to the type of self: this implementation ignores +type+ and always returns nil.
+  @overload coerce_cast(type)
+  @return [nil]
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE type)
+{
+    return Qnil;
+}
diff --git a/ext/numo/narray/gen/tmpl/cond_binary.c b/ext/numo/narray/gen/tmpl/cond_binary.c
new file mode 100644
index 0000000..8bee4e4
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/cond_binary.c
@@ -0,0 +1,55 @@
+static void  // comparison kernel: stores m_<name>(x, y) as one bit per element
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2;
+    BIT_DIGIT *a3;
+    size_t  p3;
+    ssize_t s1, s2, s3;
+    dtype   x, y;
+    BIT_DIGIT b;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR_BIT(lp, 2, a3, p3, s3);  // bit output: a3 is the BIT_DIGIT array, p3 the position passed to STORE_BIT, s3 its step
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,y);
+        b = (m_<%=name%>(x,y)) ? 1:0;
+        STORE_BIT(a3,p3,b);
+        p3+=s3;  // the output position is advanced by hand after each store
+    }
+}
+
+static VALUE  // same-class path: both operands are declared as cT, result is a Bit array
+<%=c_func%>_self(VALUE self, VALUE other)
+{
+    ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
+    ndfunc_arg_out_t aout[1] = {{numo_cBit,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 2, 1, ain, aout };  // 2 inputs, 1 output
+
+    return na_ndloop(&ndf, 2, self, other);
+}
+
+/*
+  Comparison <%=name%> other.
+  @overload <%=op_map%> other
+  @param [Numo::NArray,Numeric] other
+  @return [Numo::Bit] result of self <%=name%> other.
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE other)
+{
+    <% if is_object %>
+    return <%=c_func%>_self(self, other);  // object arrays skip the upcast check
+    <% else %>
+    VALUE klass, v;
+    klass = na_upcast(CLASS_OF(self),CLASS_OF(other));  // class chosen by na_upcast for the operand pair
+    if (klass==cT) {
+        return <%=c_func%>_self(self, other);
+    } else {  // another class was chosen: cast self to it and apply the same comparison there
+        v = rb_funcall(klass, id_cast, 1, self);
+        return rb_funcall(v, <%=id_op%>, 1, other);
+    }
+    <% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/cond_unary.c b/ext/numo/narray/gen/tmpl/cond_unary.c
new file mode 100644
index 0000000..2d7a07a
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/cond_unary.c
@@ -0,0 +1,45 @@
+static void  // predicate kernel: stores m_<name>(x) as one bit per element
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t    i;
+    char     *p1;
+    BIT_DIGIT *a2;
+    size_t    p2;
+    ssize_t   s1, s2;
+    size_t   *idx1;
+    dtype     x;
+    BIT_DIGIT b;
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR_BIT(lp, 1, a2, p2, s2);  // bit output: a2 is the BIT_DIGIT array, p2 the position passed to STORE_BIT, s2 its step
+    if (idx1) {  // input addressed through an index array
+        for (; i--;) {
+            GET_DATA_INDEX(p1,idx1,dtype,x);
+            b = (m_<%=name%>(x)) ? 1:0;
+            STORE_BIT(a2,p2,b);
+            p2+=s2;
+        }
+    } else {  // input addressed by a constant step
+        for (; i--;) {
+            GET_DATA_STRIDE(p1,s1,dtype,x);
+            b = (m_<%=name%>(x)) ? 1:0;
+            STORE_BIT(a2,p2,b);
+            p2+=s2;
+        }
+    }
+}
+
+/*
+  Condition of <%=name%>.
+  @overload <%=name%>
+  @return [Numo::Bit] Condition of <%=name%>.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{cT,0}};
+    ndfunc_arg_out_t aout[1] = {{numo_cBit,0}};  // result is a Bit array
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 1, 1, ain, aout };  // 1 input, 1 output
+
+    return na_ndloop(&ndf, 1, self);
+}
diff --git a/ext/numo/narray/gen/tmpl/cum.c b/ext/numo/narray/gen/tmpl/cum.c
new file mode 100644
index 0000000..6f589f8
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/cum.c
@@ -0,0 +1,49 @@
+<% (is_float ? ["","_nan"] : [""]).each do |j| %>
+// Running <%=name%><%=j%>: reads args[0] and stores every partial result in args[1].
+static void
+<%=c_iter%><%=j%>(na_loop_t *const lp)
+{
+    size_t   i;
+    char    *p1, *p2;
+    ssize_t  s1, s2;
+    dtype    x, y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);
+    INIT_PTR(lp, 1, p2, s2);
+    // An empty loop has no first element to copy, and the `i--` in the
+    // for-initializer below would wrap around to SIZE_MAX.
+    if (i == 0) return;
+    GET_DATA_STRIDE(p1,s1,dtype,x);    // x carries the accumulated value
+    SET_DATA_STRIDE(p2,s2,dtype,x);
+    for (i--; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,y);
+        m_<%=name%><%=j%>(x,y);        // expected to fold y into x in place
+        SET_DATA_STRIDE(p2,s2,dtype,x);
+    }
+}
+<% end %>
+
+/*
+  <%=name%> of self.
+  @overload <%=name%>(axis:nil, nan:false)
+  @param [Numeric,Array,Range] axis  Affected dimensions.
+  @param [TrueClass] nan  If true, apply NaN-aware algorithm (avoid NaN if exists).
+  @return [Numo::<%=class_name%>] <%=name%> of self.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE reduce;
+    ndfunc_arg_in_t ain[2] = {{cT,0},{sym_reduce,0}};
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP|NDF_FLAT_REDUCE|NDF_CUM,
+                     2, 1, ain, aout };
+
+  <% if is_float %>
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, <%=c_iter%>_nan);
+  <% else %>
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+  <% end %>
+    return na_ndloop(&ndf, 2, self, reduce);
+}
diff --git a/ext/numo/narray/gen/tmpl/each.c b/ext/numo/narray/gen/tmpl/each.c
new file mode 100644
index 0000000..11a4c24
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/each.c
@@ -0,0 +1,43 @@
+// Yields each element of args[0], converted with m_data_to_num, to the block.
+void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t i;
+    ssize_t s1;    // signed, like the stride variables of the other iterators
+    char *p1;
+    size_t *idx1;
+    dtype x;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    // idx1 is non-NULL when the elements are addressed through an index array.
+    if (idx1) {    // indexed access
+        for (; i--;) {
+            GET_DATA_INDEX(p1,idx1,dtype,x);
+            rb_yield(m_data_to_num(x));
+        }
+    } else {       // strided access
+        for (; i--;) {
+            GET_DATA_STRIDE(p1,s1,dtype,x);
+            rb_yield(m_data_to_num(x));
+        }
+    }
+}
+
+/*
+  Calls the given block once for each element in self,
+  passing that element as a parameter.
+  @overload <%=name%>
+  @return [Numo::NArray] self
+  For a block {|x| ... }
+  @yield [x]  x is element of NArray.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{Qnil,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0};
+
+    na_ndloop(&ndf, 1, self);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/each_with_index.c b/ext/numo/narray/gen/tmpl/each_with_index.c
new file mode 100644
index 0000000..49fc9e5
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/each_with_index.c
@@ -0,0 +1,64 @@
+static inline void
+yield_each_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
+{
+    int j;
+
+    a[0] = m_data_to_num(x);
+    for (j=0; j<=nd; j++) {
+        a[j+1] = SIZET2NUM(c[j]);
+    }
+    rb_yield(rb_ary_new4(md,a));
+}
+
+
+void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t i;
+    ssize_t s1;    // signed, like the stride variables of the other iterators
+    char *p1;
+    size_t *idx1;
+    dtype x;
+    VALUE *a;      // scratch buffer: [element, c[0], ..., c[ndim-1]]
+    size_t *c;     // index counters handed in through lp->opt_ptr
+    int nd, md;
+
+    c = (size_t*)(lp->opt_ptr);
+    nd = lp->ndim - 1;         // position of the last index in c[]
+    md = lp->ndim + 1;         // one element plus ndim indices
+    a = ALLOCA_N(VALUE,md);
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    c[nd] = 0;                 // only the last index is advanced in this function
+    if (idx1) {
+        for (; i--;) {
+            GET_DATA_INDEX(p1,idx1,dtype,x);
+            yield_each_with_index(x,c,a,nd,md);
+            c[nd]++;
+        }
+    } else {
+        for (; i--;) {
+            GET_DATA_STRIDE(p1,s1,dtype,x);
+            yield_each_with_index(x,c,a,nd,md);
+            c[nd]++;
+        }
+    }
+}
+
+/*
+  Invokes the given block once for each element of self,
+  passing that element and indices along each axis as parameters.
+  @overload <%=name%>
+  @return [Numo::NArray] self
+  For a block {|x,i,j,...| ... }
+  @yield [x,i,j,...]  x is an element, i,j,... are multidimensional indices.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{Qnil,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0};
+
+    na_ndloop_with_index(&ndf, 1, self);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/extract.c b/ext/numo/narray/gen/tmpl/extract.c
new file mode 100644
index 0000000..7fe8944
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/extract.c
@@ -0,0 +1,23 @@
+/*
+  Extract an element only if self is a dimensionless NArray.
+  @overload extract
+  @return [Numeric,Numo::NArray]
+  --- Extract element value as Ruby Object if self is a dimensionless NArray,
+  otherwise returns self.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    narray_t *na;
+    char *elem;
+    volatile VALUE result;
+    GetNArray(self,na);
+    // Anything with at least one dimension is handed back as is.
+    if (na->ndim != 0) {
+        return self;
+    }
+    elem = na_get_pointer_for_read(self) + na_get_offset(self);
+    result = m_extract(elem);
+    na_release_lock(self);
+    return result;
+}
diff --git a/ext/numo/narray/gen/tmpl/extract_data.c b/ext/numo/narray/gen/tmpl/extract_data.c
new file mode 100644
index 0000000..b64a39c
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/extract_data.c
@@ -0,0 +1,48 @@
+/*
+  Convert a data value of obj (with a single element) to dtype.
+*/
+static dtype
+<%=c_func(:nodef)%>(VALUE obj)
+{
+    narray_t *na;
+    dtype  x;
+    char  *ptr;
+    size_t pos;
+    VALUE  r, klass;
+
+    if (IsNArray(obj)) {
+        GetNArray(obj,na);
+        if (na->size != 1) {
+            rb_raise(nary_eShapeError,"narray size should be 1");
+        }
+        klass = CLASS_OF(obj);
+        ptr = na_get_pointer_for_read(obj);
+        pos = na_get_offset(obj);
+        <% find_tmpl("store").definitions.select{|x| x.class==Store}.each do |x| %>
+        if (<%=x.condition("klass")%>) {
+            <%=x.extract_data("ptr","pos","x")%>;
+            return x;
+        }
+        <% end %>
+
+        // coerce
+        r = rb_funcall(obj, rb_intern("coerce_cast"), 1, cT);
+        if (CLASS_OF(r)==cT) {
+            return <%=c_func%>(r);
+        }
+        <% if is_object %>
+        return obj;
+        <% else %>
+        rb_raise(nary_eCastError, "unknown conversion from %s to %s",
+                 rb_class2name(CLASS_OF(obj)),
+                 rb_class2name(cT));
+        <% end %>
+    }
+    if (TYPE(obj)==T_ARRAY) {
+        if (RARRAY_LEN(obj) != 1) {
+            rb_raise(nary_eShapeError,"array size should be 1");
+        }
+        return m_num_to_data(RARRAY_AREF(obj,0));
+    }
+    return m_num_to_data(obj);
+}
diff --git a/ext/numo/narray/gen/tmpl/eye.c b/ext/numo/narray/gen/tmpl/eye.c
new file mode 100644
index 0000000..b4b04f5
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/eye.c
@@ -0,0 +1,91 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t   n0, n1;
+    size_t   i0, i1;
+    ssize_t  s0, s1;
+    char    *p0, *p1;
+    char    *g;
+    ssize_t kofs;
+    dtype   data;
+
+    g = (char*)(lp->opt_ptr);
+    kofs = *(ssize_t*)g;
+    data = *(dtype*)(g+sizeof(ssize_t));
+
+    n0 = lp->args[0].shape[0];
+    n1 = lp->args[0].shape[1];
+    s0 = lp->args[0].iter[0].step;
+    s1 = lp->args[0].iter[1].step;
+    p0 = NDL_PTR(lp,0);
+
+    for (i0=0; i0 < n0; i0++) {
+        p1 = p0;
+        for (i1=0; i1 < n1; i1++) {
+            *(dtype*)p1 = (i0+kofs==i1) ? data : m_zero;
+            p1 += s1;
+        }
+        p0 += s0;
+    }
+}
+
+/*
+  Eye: Set a value to diagonal components, set 0 to non-diagonal components.
+  @overload <%=name%>([element,offset])
+  @param [Numeric] element  Diagonal element to be stored. Default is 1.
+  @param [Integer] offset Diagonal offset from the main diagonal.  The
+      default is 0. offset>0 for diagonals above the main diagonal, and
+      offset<0 for diagonals below the main diagonal.
+  @return [Numo::<%=class_name%>] <%=name%> of self.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{OVERWRITE,2}};
+    ndfunc_t ndf = {<%=c_iter%>, NO_LOOP, 1,0, ain,0};
+    ssize_t kofs;
+    dtype data;
+    char *g;       // option block for the iterator: an ssize_t followed by a dtype
+    int nd;
+    narray_t *na;
+
+    // check arguments: ([element [, offset]]), defaulting to (m_one, 0)
+    if (argc > 2) {
+        rb_raise(rb_eArgError,"too many arguments (%d for 0..2)",argc);
+    } else if (argc == 2) {
+        data = m_num_to_data(argv[0]);
+        kofs = NUM2SSIZET(argv[1]);
+    } else if (argc == 1) {
+        data = m_num_to_data(argv[0]);
+        kofs = 0;
+    } else {
+        data = m_one;
+        kofs = 0;
+    }
+
+    GetNArray(self,na);
+    nd = na->ndim;
+    if (nd < 2) {
+        rb_raise(nary_eDimensionError,"less than 2-d array");
+    }
+
+    // The offset must stay inside the last two dimensions (shape values are size_t, hence %u).
+    if (kofs >= 0) {
+        if ((size_t)(kofs) >= na->shape[nd-1]) {
+            rb_raise(rb_eArgError,"invalid diagonal offset(%"SZF"d) for "
+                     "last dimension size(%"SZF"u)",kofs,na->shape[nd-1]);
+        }
+    } else {
+        if ((size_t)(-kofs) >= na->shape[nd-2]) {
+            rb_raise(rb_eArgError,"invalid diagonal offset(%"SZF"d) for "
+                     "last-1 dimension size(%"SZF"u)",kofs,na->shape[nd-2]);
+        }
+    }
+
+    g = ALLOCA_N(char,sizeof(ssize_t)+sizeof(dtype));
+    *(ssize_t*)g = kofs;                      // read back by the iterator as kofs
+    *(dtype*)(g+sizeof(ssize_t)) = data;      // read back by the iterator as data
+
+    na_ndloop3(&ndf, g, 1, self);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/fill.c b/ext/numo/narray/gen/tmpl/fill.c
new file mode 100644
index 0000000..6c26171
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/fill.c
@@ -0,0 +1,38 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t   i;
+    char    *p1;
+    ssize_t  s1;
+    size_t  *idx1;
+    VALUE    x = lp->option;
+    dtype    y;
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    y = m_num_to_data(x);
+    if (idx1) {
+        for (; i--;) {
+            SET_DATA_INDEX(p1,idx1,dtype,y);
+        }
+    } else {
+        for (; i--;) {
+            SET_DATA_STRIDE(p1,s1,dtype,y);
+        }
+    }
+}
+
+/*
+  Fill elements with other.
+  @overload <%=name%> other
+  @param [Numeric] other
+  @return [Numo::<%=class_name%>] self.
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE val)
+{
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_option}};
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 2, 0, ain, 0 };
+
+    na_ndloop(&ndf, 2, self, val);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/format.c b/ext/numo/narray/gen/tmpl/format.c
new file mode 100644
index 0000000..e7ef9f0
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/format.c
@@ -0,0 +1,60 @@
+static VALUE
+format_<%=type_name%>(VALUE fmt, dtype* x)
+{
+    // FIXME: s[] is a fixed 48-byte buffer and m_sprintf is given no size limit.
+    char s[48];
+    int n;
+
+    if (NIL_P(fmt)) {                 // no format given: use the type's own m_sprintf
+        n = m_sprintf(s,*x);          // n is used as the string length below
+        return rb_str_new(s,n);
+    }
+    return rb_funcall(fmt, '%', 1, m_data_to_num(*x));  // '%' is the method ID: calls fmt % value
+}
+
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2;
+    ssize_t s1, s2;
+    size_t *idx1;
+    dtype *x;
+    VALUE y;
+    VALUE fmt = lp->option;
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR(lp, 1, p2, s2);
+    if (idx1) {
+        for (; i--;) {
+            x = (dtype*)(p1+*idx1); idx1++;
+            y = format_<%=type_name%>(fmt, x);
+            SET_DATA_STRIDE(p2, s2, VALUE, y);
+        }
+    } else {
+        for (; i--;) {
+            x = (dtype*)p1;         p1+=s1;
+            y = format_<%=type_name%>(fmt, x);
+            SET_DATA_STRIDE(p2, s2, VALUE, y);
+        }
+    }
+}
+
+/*
+  Format elements into strings.
+  @overload <%=name%> format
+  @param [String] format
+  @return [Numo::RObject] array of formatted strings.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE fmt=Qnil;
+
+    ndfunc_arg_in_t ain[2] = {{Qnil,0},{sym_option}};
+    ndfunc_arg_out_t aout[1] = {{numo_cRObject,0}};
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP_NIP, 2, 1, ain, aout };
+
+    rb_scan_args(argc, argv, "01", &fmt);
+    return na_ndloop(&ndf, 2, self, fmt);
+}
diff --git a/ext/numo/narray/gen/tmpl/format_to_a.c b/ext/numo/narray/gen/tmpl/format_to_a.c
new file mode 100644
index 0000000..7676814
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/format_to_a.c
@@ -0,0 +1,47 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1;
+    ssize_t s1;
+    size_t *idx1;
+    dtype *x;
+    VALUE y;
+    volatile VALUE a;
+    VALUE fmt = lp->option;
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    a = rb_ary_new2(i);
+    rb_ary_push(lp->args[1].value, a);
+    if (idx1) {
+        for (; i--;) {
+            x = (dtype*)(p1 + *idx1);  idx1++;
+            y = format_<%=type_name%>(fmt, x);
+            rb_ary_push(a,y);
+        }
+    } else {
+        for (; i--;) {
+            x = (dtype*)p1;  p1+=s1;
+            y = format_<%=type_name%>(fmt, x);
+            rb_ary_push(a,y);
+        }
+    }
+}
+
+/*
+  Format elements into strings.
+  @overload <%=name%> format
+  @param [String] format
+  @return [Array] array of formatted strings.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    volatile VALUE fmt=Qnil;
+    ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
+    ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP_NIP, 3, 1, ain, aout };
+
+    rb_scan_args(argc, argv, "01", &fmt);
+    return na_ndloop_cast_narray_to_rarray(&ndf, self, fmt);
+}
diff --git a/ext/numo/narray/gen/tmpl/frexp.c b/ext/numo/narray/gen/tmpl/frexp.c
new file mode 100644
index 0000000..1dde9a2
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/frexp.c
@@ -0,0 +1,37 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t   i;
+    char    *p1, *p2, *p3;
+    ssize_t  s1, s2, s3;
+    dtype    x;
+    int      y;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        x = m_<%=name%>(x,&y);
+        SET_DATA_STRIDE(p2,s2,dtype,x);
+        SET_DATA_STRIDE(p3,s3,int32_t,y);
+    }
+}
+
+/*
+  split the number x into a normalized fraction and an exponent.
+  Returns [mantissa, exponent], where x = mantissa * 2**exponent.
+
+  @overload <%=name%>(x)
+  @param [Numo::NArray,Numeric]  x
+  @return [Numo::<%=class_name%>,Numo::Int32]  mantissa and exponent.
+
+*/
+static VALUE
+<%=c_func(1)%>(VALUE mod, VALUE a1)
+{
+    ndfunc_arg_in_t ain[1] = {{cT,0}};
+    ndfunc_arg_out_t aout[2] = {{cT,0},{numo_cInt32,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 1,2, ain,aout };
+    return na_ndloop(&ndf, 1, a1);
+}
diff --git a/ext/numo/narray/gen/tmpl/init_class.c b/ext/numo/narray/gen/tmpl/init_class.c
new file mode 100644
index 0000000..aa1d014
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/init_class.c
@@ -0,0 +1,20 @@
+    /*
+      Document-class: <%= full_class_name %>
+      <%= description %>
+    */
+    cT = rb_define_class_under(<%=ns_var%>, "<%=class_name%>", cNArray);
+
+  <% for x in class_alias %>
+    // alias of <%=class_name%>
+    rb_define_const(<%=ns_var%>, "<%=x%>", <%=type_var%>);
+  <% end %>
+
+    hCast = rb_hash_new();
+    rb_define_const(cT, "UPCAST", hCast);
+    rb_hash_aset(hCast, rb_cArray,   cT);
+    <% for x in upcast %>
+    <%= x %><% end %>
+
+    <% @children.each do |m| %>
+    <%= m.init_def %><% end %>
+    rb_define_singleton_method(cT, "[]", <%=find("cast").c_func%>, -2);
diff --git a/ext/numo/narray/gen/tmpl/init_module.c b/ext/numo/narray/gen/tmpl/init_module.c
new file mode 100644
index 0000000..17672f5
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/init_module.c
@@ -0,0 +1,12 @@
+    /*
+      Document-module: <%= full_module_name %>
+      <%= description %>
+    */
+    <%  if module_var != ns_var %>
+    <%=module_var%> = rb_define_module_under(<%=ns_var%>, "<%=module_name%>");
+    <%  end %>
+    <% @children.each do |m| %>
+    <%= m.init_def %><% end %>
+
+    //  how to do this?
+    //rb_extend_object(cT, mTM);
diff --git a/ext/numo/narray/gen/tmpl/inspect.c b/ext/numo/narray/gen/tmpl/inspect.c
new file mode 100644
index 0000000..d32afe3
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/inspect.c
@@ -0,0 +1,20 @@
+static VALUE
+<%=c_iter%>(char *ptr, size_t pos, VALUE fmt)
+{
+<% if is_object %>
+    return rb_inspect(*(VALUE*)(ptr+pos));
+<% else %>
+    return format_<%=type_name%>(fmt, (dtype*)(ptr+pos));
+<% end %>
+}
+
+/*
+  Returns a string containing a human-readable representation of NArray.
+  @overload inspect
+  @return [String]
+*/
+VALUE
+<%=c_func(0)%>(VALUE ary)
+{
+    return na_ndloop_inspect(ary, <%=c_iter%>, Qnil);
+}
diff --git a/ext/numo/narray/gen/tmpl/lib.c b/ext/numo/narray/gen/tmpl/lib.c
new file mode 100644
index 0000000..14354c3
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/lib.c
@@ -0,0 +1,45 @@
+/*
+  <%= file_name %>
+  Ruby/Numo::GSL - GSL wrapper for Ruby/Numo::NArray
+
+  created on: 2017-03-11
+  Copyright (C) 2017 Masahiro Tanaka
+*/
+
+#include <ruby.h>
+#include <assert.h>
+#include "numo/narray.h"
+#include "numo/template.h"
+#include "SFMT.h"
+
+#define m_map(x) m_num_to_data(rb_yield(m_data_to_num(x)))
+
+<% id_decl.each do |x| %>
+<%= x %>
+<% end %>
+
+<% include_files.each do |f| %>
+#include <<%=f%>>
+<% end %>
+
+VALUE cT;
+extern VALUE cRT;
+
+<% children.each do |c|%>
+<%= c.result+"\n\n" %>
+<% end %>
+
+void
+Init_<%=lib_name%>(void)
+{
+    VALUE hCast, <%=ns_var%>;
+
+    <%=ns_var%> = rb_define_module("Numo");
+
+    <% id_assign.each do |x| %>
+    <%= x %><% end %>
+
+<% children.each do |c| %>
+<%= c.init_def %>
+<% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/logseq.c b/ext/numo/narray/gen/tmpl/logseq.c
new file mode 100644
index 0000000..84930c8
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/logseq.c
@@ -0,0 +1,82 @@
+typedef struct {
+    seq_data_t beg;
+    seq_data_t step;
+    seq_data_t base;
+    seq_count_t count;
+} logseq_opt_t;
+
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1;
+    ssize_t s1;
+    size_t *idx1;
+    dtype   x;
+    seq_data_t beg, step, base;
+    seq_count_t c;
+    logseq_opt_t *g;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    g = (logseq_opt_t*)(lp->opt_ptr);
+    beg  = g->beg;
+    step = g->step;
+    base = g->base;
+    c    = g->count;
+    if (idx1) {
+        for (; i--;) {
+            x = f_seq(beg,step,c++);
+            *(dtype*)(p1+*idx1) = m_pow(base,x);
+            idx1++;
+        }
+    } else {
+        for (; i--;) {
+            x = f_seq(beg,step,c++);
+            *(dtype*)(p1) = m_pow(base,x);
+            p1 += s1;
+        }
+    }
+    g->count = c;
+}
+
+/*
+  Set logarithmic sequence of numbers to self. The sequence is obtained from
+     base**(beg+i*step)
+  where i is 1-dimensional index.
+  Applicable classes: DFloat, SFloat, DComplex, SComplex.
+
+  @overload logseq(beg,step,[base])
+  @param [Numeric] beg  The beginning of sequence.
+  @param [Numeric] step  The step of sequence.
+  @param [Numeric] base  The base of log space. (default=10)
+  @return [Numo::<%=class_name%>] self.
+
+  @example
+    Numo::DFloat.new(5).logseq(4,-1,2)
+    => Numo::DFloat#shape=[5]
+      [16, 8, 4, 2, 1]
+    Numo::DComplex.new(5).logseq(0,1i*Math::PI/3,Math::E)
+    => Numo::DComplex#shape=[5]
+      [1+7.26156e-310i, 0.5+0.866025i, -0.5+0.866025i, -1+1.22465e-16i, ...]
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *args, VALUE self)
+{
+    logseq_opt_t *g;
+    VALUE vbeg, vstep, vbase;
+    ndfunc_arg_in_t ain[1] = {{OVERWRITE,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 1,0, ain,0};
+
+    g = ALLOCA_N(logseq_opt_t,1);   // stack memory: every field must be set here
+    rb_scan_args(argc, args, "21", &vbeg, &vstep, &vbase);
+    g->beg = m_num_to_data(vbeg);
+    g->step = m_num_to_data(vstep);
+    // base is the optional third argument and defaults to 10
+    g->base = NIL_P(vbase) ? m_from_real(10) : m_num_to_data(vbase);
+    // The iterator reads g->count as the running element index and writes
+    // it back when it returns, so the sequence has to start from zero.
+    g->count = 0;
+    na_ndloop3(&ndf, g, 1, self);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/map_with_index.c b/ext/numo/narray/gen/tmpl/map_with_index.c
new file mode 100644
index 0000000..58c55f5
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/map_with_index.c
@@ -0,0 +1,94 @@
+static inline dtype
+yield_map_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
+{
+    int j;
+    VALUE y;
+
+    a[0] = m_data_to_num(x);
+    for (j=0; j<=nd; j++) {
+        a[j+1] = SIZET2NUM(c[j]);
+    }
+    y = rb_yield(rb_ary_new4(md,a));
+    return m_num_to_data(y);
+}
+
+void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2;
+    ssize_t s1, s2;
+    size_t *idx1, *idx2;
+    dtype x;
+    VALUE *a;
+    size_t *c;
+    int nd, md;
+
+    c = (size_t*)(lp->opt_ptr);
+    nd = lp->ndim - 1;
+    md = lp->ndim + 1;
+    a = ALLOCA_N(VALUE,md);
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+
+    c[nd] = 0;
+    if (idx1) {
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                x = yield_map_with_index(x,c,a,nd,md);
+                SET_DATA_INDEX(p2,idx2,dtype,x);
+                c[nd]++;
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                x = yield_map_with_index(x,c,a,nd,md);
+                SET_DATA_STRIDE(p2,s2,dtype,x);
+                c[nd]++;
+            }
+        }
+    } else {
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                x = yield_map_with_index(x,c,a,nd,md);
+                SET_DATA_INDEX(p2,idx2,dtype,x);
+                c[nd]++;
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                x = yield_map_with_index(x,c,a,nd,md);
+                SET_DATA_STRIDE(p2,s2,dtype,x);
+                c[nd]++;
+            }
+        }
+    }
+}
+
+/*
+  Invokes the given block once for each element of self,
+  passing that element and indices along each axis as parameters.
+  Creates a new NArray containing the values returned by the block.
+  Inplace option is allowed, i.e., `nary.inplace.map` overwrites `nary`.
+
+  @overload <%=name%>
+
+  For a block {|x,i,j,...| ... }
+  @yield [x,i,j,...]  x is an element, i,j,... are multidimensional indices.
+
+  @return [Numo::NArray] mapped array
+
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{Qnil,0}};
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 1,1, ain,aout};
+
+    return na_ndloop_with_index(&ndf, 1, self);
+}
diff --git a/ext/numo/narray/gen/tmpl/median.c b/ext/numo/narray/gen/tmpl/median.c
new file mode 100644
index 0000000..bf821b9
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/median.c
@@ -0,0 +1,64 @@
+<% (is_float ? ["_ignan","_prnan"] : [""]).each do |j| %>
+static void
+<%=c_iter%><%=j%>(na_loop_t *const lp)
+{
+    size_t n;
+    char *p1, *p2;
+    dtype *buf;
+
+    INIT_COUNTER(lp, n);
+    p1 = (lp->args[0]).ptr + (lp->args[0].iter[0]).pos;
+    p2 = (lp->args[1]).ptr + (lp->args[1].iter[0]).pos;
+    buf = (dtype*)p1;    // the input is used as a plain dtype array (no stride applied)
+
+    <%=type_name%>_qsort<%=j%>(buf, n, sizeof(dtype));    // sorts the input buffer itself
+
+    <% if is_float %>
+    for (; n; n--) {     // shrink n past trailing NaNs; presumably the sort puts NaN last - TODO confirm
+        if (!isnan(buf[n-1])) break;
+    }
+    <% end %>
+
+    if (n==0) {          // nothing left to take a median of: buf[0] is passed through
+        *(dtype*)p2 = buf[0];    // NOTE(review): buf[0] is read even if the counter was 0 on entry
+    }
+    else if (n%2==0) {   // even count: mean of the two middle elements
+        *(dtype*)p2 = (buf[n/2-1]+buf[n/2])/2;
+    }
+    else {               // odd count: the middle element
+        *(dtype*)p2 = buf[(n-1)/2];
+    }
+}
+<% end %>
+
+/*
+  <%=name%> of self.
+<% if is_float %>
+  @overload <%=name%>(axis:nil, keepdims:false, nan:false)
+  @param [TrueClass] nan (keyword) If true, propagate NaN. If false, ignore NaN.
+<% else %>
+  @overload <%=name%>(axis:nil, keepdims:false)
+<% end %>
+  @param [Numeric,Array,Range] axis (keyword) Affected dimensions.
+  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
+  @return [Numo::<%=class_name%>] returns <%=name%> of self.
+*/
+
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE reduce;
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_reduce,0}};
+    ndfunc_arg_out_t aout[1] = {{INT2FIX(0),0}};
+    ndfunc_t ndf = {0, NDF_HAS_LOOP|NDF_FLAT_REDUCE, 2,1, ain,aout};
+
+    self = na_copy(self); // as temporary buffer
+  <% if is_float %>
+    ndf.func = <%=c_iter%>_ignan;
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, <%=c_iter%>_prnan);
+  <% else %>
+    ndf.func = <%=c_iter%>;
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+  <% end %>
+    return na_ndloop(&ndf, 2, self, reduce);
+}
diff --git a/ext/numo/narray/gen/tmpl/minmax.c b/ext/numo/narray/gen/tmpl/minmax.c
new file mode 100644
index 0000000..e15b56d
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/minmax.c
@@ -0,0 +1,46 @@
+<% (is_float ? ["","_nan"] : [""]).each do |j| %>
+static void
+<%=c_iter%><%=j%>(na_loop_t *const lp)
+{
+    size_t   n;
+    char    *p1;
+    ssize_t  s1;
+    dtype    xmin,xmax;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR(lp, 0, p1, s1);
+
+    f_<%=name%><%=j%>(n,p1,s1,&xmin,&xmax);
+
+    *(dtype*)(lp->args[1].ptr + lp->args[1].iter[0].pos) = xmin;
+    *(dtype*)(lp->args[2].ptr + lp->args[2].iter[0].pos) = xmax;
+}
+<% end %>
+
+/*
+  <%=name%> of self.
+<% if is_float %>
+  @overload <%=name%>(axis:nil, keepdims:false, nan:false)
+  @param [TrueClass] nan  If true, apply NaN-aware algorithm (return NaN if exist).
+<% else %>
+  @overload <%=name%>(axis:nil, keepdims:false)
+<% end %>
+  @param [Numeric,Array,Range] axis (keyword) Affected dimensions.
+  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
+  @return [Numo::<%=class_name%>,Numo::<%=class_name%>] min and max of self.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE reduce;
+    ndfunc_arg_in_t ain[2] = {{cT,0},{sym_reduce,0}};
+    ndfunc_arg_out_t aout[2] = {{cT,0},{cT,0}};
+    ndfunc_t ndf = {<%=c_iter%>, STRIDE_LOOP_NIP|NDF_FLAT_REDUCE|NDF_EXTRACT, 2,2, ain,aout};
+
+  <% if is_float %>
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, <%=c_iter%>_nan);
+  <% else %>
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+  <% end %>
+    return na_ndloop(&ndf, 2, self, reduce);
+}
diff --git a/ext/numo/narray/gen/tmpl/module.c b/ext/numo/narray/gen/tmpl/module.c
new file mode 100644
index 0000000..5d066bb
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/module.c
@@ -0,0 +1,9 @@
+/*
+  module definition: <%= full_module_name %>
+*/
+
+<%  if module_var != ns_var %>
+VALUE <%=module_var%>;
+<%  end %>
+
+<%= method_code %>
diff --git a/ext/numo/narray/gen/tmpl/new_dim0.c b/ext/numo/narray/gen/tmpl/new_dim0.c
new file mode 100644
index 0000000..0fe8a40
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/new_dim0.c
@@ -0,0 +1,12 @@
+// Builds a 0-dimensional cT array whose single element is x.
+static VALUE
+<%=c_func(:nodef)%>(dtype x)
+{
+    VALUE scalar = nary_new(cT, 0, NULL);
+    char *buf = (char*)na_get_pointer_for_write(scalar);
+
+    // store the value, then release the lock taken for writing
+    *(dtype*)buf = x;
+    na_release_lock(scalar);
+    return scalar;
+}
diff --git a/ext/numo/narray/gen/tmpl/poly.c b/ext/numo/narray/gen/tmpl/poly.c
new file mode 100644
index 0000000..f2282c5
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/poly.c
@@ -0,0 +1,49 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    dtype  x, y, a;
+
+    x = *(dtype*)(lp->args[0].ptr + lp->args[0].iter[0].pos);
+    i = lp->narg - 2;
+    y = *(dtype*)(lp->args[i].ptr + lp->args[i].iter[0].pos);
+    for (; --i;) {
+        y = m_mul(x,y);
+        a = *(dtype*)(lp->args[i].ptr + lp->args[i].iter[0].pos);
+        y = m_add(y,a);
+    }
+    i = lp->narg - 1;
+    *(dtype*)(lp->args[i].ptr + lp->args[i].iter[0].pos) = y;
+}
+
+/*
+  Polynomial.: a0 + a1*x + a2*x**2 + a3*x**3 + ... + an*x**n
+  @overload <%=name%> a0, a1, ...
+  @param [Numo::NArray,Numeric] a0
+  @param [Numo::NArray,Numeric] a1 , ...
+  @return [Numo::<%=class_name%>]
+*/
+static VALUE
+<%=c_func(-2)%>(VALUE self, VALUE args)
+{
+    int argc, i;
+    VALUE *argv;
+    volatile VALUE v, a;
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, NO_LOOP, 0, 1, 0, aout };
+
+    argc = RARRAY_LEN(args);
+    // With no coefficient the iterator's countdown loop would run out of bounds.
+    if (argc < 1) rb_raise(rb_eArgError,"wrong number of arguments (0 for 1+)");
+    ndf.nin = argc+1;
+    ndf.ain = ALLOCA_N(ndfunc_arg_in_t,argc+1);
+    argv = ALLOCA_N(VALUE,argc+1);
+    for (i=0; i<argc+1; i++) {   // loop inputs are [self, a0, a1, ...]
+        ndf.ain[i].type = cT;
+        ndf.ain[i].dim = 0;      // ALLOCA_N leaves the memory uninitialized
+        argv[i] = (i==0) ? self : RARRAY_AREF(args,i-1);
+    }
+    a = rb_ary_new4(argc+1, argv);
+    v = na_ndloop2(&ndf, a);
+    return <%=type_name%>_extract(v);
+}
diff --git a/ext/numo/narray/gen/tmpl/pow.c b/ext/numo/narray/gen/tmpl/pow.c
new file mode 100644
index 0000000..0d43bc6
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/pow.c
@@ -0,0 +1,78 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char    *p1, *p2, *p3;
+    ssize_t s1, s2, s3;
+    dtype    x, y;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,dtype,y);
+        x = m_pow(x,y);
+        SET_DATA_STRIDE(p3,s3,dtype,x);
+    }
+}
+
+static void
+<%=c_iter%>_int32(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2, *p3;
+    ssize_t s1, s2, s3;
+    dtype   x;
+    int32_t y;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);
+    INIT_PTR(lp, 1, p2, s2);
+    INIT_PTR(lp, 2, p3, s3);
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        GET_DATA_STRIDE(p2,s2,int32_t,y);
+        x = m_pow_int(x,y);
+        SET_DATA_STRIDE(p3,s3,dtype,x);
+    }
+}
+
+static VALUE
+<%=c_func%>_self(VALUE self, VALUE other)
+{
+    ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
+    ndfunc_arg_in_t ain_i[2] = {{cT,0},{numo_cInt32,0}};
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 2, 1, ain, aout };
+    ndfunc_t ndf_i = { <%=c_iter%>_int32, STRIDE_LOOP, 2, 1, ain_i, aout };
+
+    // fixme : use na.integer?
+    if (FIXNUM_P(other) || rb_obj_is_kind_of(other,numo_cInt32)) {
+        return na_ndloop(&ndf_i, 2, self, other);
+    } else {
+        return na_ndloop(&ndf, 2, self, other);
+    }
+}
+
+/*
+  Binary power.
+  @overload <%=op_map%> other
+  @param [Numo::NArray,Numeric] other
+  @return [Numo::NArray] self to the other-th power.
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE other)
+{
+    <% if is_object %>
+    return <%=c_func%>_self(self,other);
+    <% else %>
+    VALUE klass, v;
+    klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
+    if (klass==cT) {
+        return <%=c_func%>_self(self,other);
+    } else {
+        v = rb_funcall(klass, id_cast, 1, self);
+        return rb_funcall(v, id_pow, 1, other);
+    }
+    <% end %>
+}
diff --git a/ext/numo/narray/gen/tmpl/powint.c b/ext/numo/narray/gen/tmpl/powint.c
new file mode 100644
index 0000000..2125ad3
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/powint.c
@@ -0,0 +1,17 @@
+// x**p by repeated squaring; a negative p yields m_zero.
+static dtype pow_<%=type_name%>(dtype x, int p)
+{
+    dtype r = m_one;
+    switch(p) {   // shortcuts for small exponents
+    case 2: return m_square(x);
+    case 3: return m_mul(m_square(x),x);
+    case 1: return x;
+    case 0: return m_one;
+    }
+    if (p<0)  return m_zero;
+    for (;;) {    // p >= 4 here, so at least one bit remains
+        if ((p%2) == 1) r = m_mul(r,x);
+        if ((p /= 2) == 0) return r;   // last bit consumed: skip the unused extra squaring
+        x = m_square(x);
+    }
+}
diff --git a/ext/numo/narray/gen/tmpl/qsort.c b/ext/numo/narray/gen/tmpl/qsort.c
new file mode 100644
index 0000000..2be8b1d
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/qsort.c
@@ -0,0 +1,150 @@
+/*
+  qsort.c
+  Numerical Array Extension for Ruby
+    modified by Masahiro TANAKA
+*/
+
+/*
+ *      qsort.c: standard quicksort algorithm
+ *
+ *      Modifications from vanilla NetBSD source:
+ *        Add do ... while() macro fix
+ *        Remove __inline, _DIAGASSERTs, __P
+ *        Remove ill-considered "swap_cnt" switch to insertion sort,
+ *        in favor of a simple check for presorted input.
+ *
+ *      CAUTION: if you change this file, see also qsort_arg.c
+ *
+ *      $PostgreSQL: pgsql/src/port/qsort.c,v 1.12 2006/10/19 20:56:22 tgl Exp $
+ */
+
+/*      $NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $   */
+
+/*-
+ * Copyright (c) 1992, 1993
+ *      The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *        may be used to endorse or promote products derived from this software
+ *        without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.      IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef QSORT_INCL  /* helper macros are defined only once, however many times this template is expanded */
+#define QSORT_INCL
+#define Min(x, y)               ((x) < (y) ? (x) : (y))
+/* swap: exchange the single `type` object stored at a with the one stored at b. */
+#define swap(type,a,b) \
+    do {type tmp=*(type*)(a); *(type*)(a)=*(type*)(b); *(type*)(b)=tmp;} while(0)
+/* vecswap: NOTE(review) exchanges just one element pair when n>0; n is not used as a length here. */
+#define vecswap(type, a, b, n) if ((n)>0) swap(type,(a),(b))
+/* MED3: yields whichever of the pointers a, b, c addresses the median value according to cmpgt. */
+#define MED3(a,b,c)                                     \
+    (cmpgt(b,a) ?                                       \
+     (cmpgt(c,b) ? b : (cmpgt(c,a) ? c : a))            \
+     : (cmpgt(b,c) ? b : (cmpgt(c,a) ? a : c)))
+#endif
+
+#undef qsort_dtype  /* rebound on every expansion of this template */
+#define qsort_dtype <%=dtype%>  /* element type handed to swap() in the sort routine */
+#undef qsort_cast
+#define qsort_cast <%=dcast%>  /* not used in this file; presumably consumed by cmp/cmpgt defined elsewhere - confirm */
+<% if "#{suffix}" != "" %>
+#undef cmp
+#undef cmpgt
+#define cmp(a,b) cmp<%=suffix%>(a,b)  /* route comparisons to the suffixed variant */
+#define cmpgt(a,b) cmpgt<%=suffix%>(a,b)
+<% end %>
+<% c_func(:nodef)%>
+/* Sorts, in place, n elements of qsort_dtype that start at a and lie es bytes apart, ordered by cmp/cmpgt. */
+void  /* a: first element; n: element count; es: byte stride between elements (assumed > 0, as the loops below require) */
+<%=type_name%>_qsort<%=suffix%>(void *a, size_t n, ssize_t es)
+{
+    char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+    ssize_t d, r;       /* byte distances; ssize_t so they cannot truncate on very large arrays */
+    int  presorted;
+
+ loop:
+    if (n < 7) {        /* short input: plain insertion sort */
+        for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+            for (pl = pm; pl > (char *) a && cmpgt(pl - es, pl); pl -= es)
+                swap(qsort_dtype, pl, pl - es);
+        return;
+    }
+    presorted = 1;
+    for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) {
+        if (cmpgt(pm - es, pm)) {
+            presorted = 0;
+            break;
+        }
+    }
+    if (presorted) return;      /* already in order: nothing to do */
+    pm = (char *) a + (n / 2) * es;
+    if (n > 7) {                /* pivot: median of three, or of three medians when n > 40 */
+        pl = (char *) a;
+        pn = (char *) a + (n - 1) * es;
+        if (n > 40) {
+            d = (n / 8) * es;
+            pl = MED3(pl, pl + d, pl + 2 * d);
+            pm = MED3(pm - d, pm, pm + d);
+            pn = MED3(pn - 2 * d, pn - d, pn);
+        }
+        pm = MED3(pl, pm, pn);
+    }
+    swap(qsort_dtype, a, pm);   /* park the pivot in the first slot */
+    pa = pb = (char *) a + es;
+    pc = pd = (char *) a + (n - 1) * es;
+    for (;;) {                  /* partition; keys equal to the pivot are collected at both ends */
+        while (pb <= pc && (r = cmp(pb, a)) <= 0) {
+            if (r == 0) {
+                swap(qsort_dtype, pa, pb);
+                pa += es;
+            }
+            pb += es;
+        }
+        while (pb <= pc && (r = cmp(pc, a)) >= 0) {
+            if (r == 0) {
+                swap(qsort_dtype, pc, pd);
+                pd -= es;
+            }
+            pc -= es;
+        }
+        if (pb > pc) break;
+        swap(qsort_dtype, pb, pc);
+        pb += es;
+        pc -= es;
+    }
+    pn = (char *) a + n * es;
+    r = Min(pa - (char *) a, pb - pa);   /* move the left run of equal keys next to the smaller keys */
+    for (pl = (char *) a, pm = pb - r; r > 0; r -= es, pl += es, pm += es)
+        swap(qsort_dtype, pl, pm);       /* every element of the run is exchanged, not only the first */
+    r = Min(pd - pc, pn - pd - es);      /* likewise for the right run of equal keys */
+    for (pl = pb, pm = pn - r; r > 0; r -= es, pl += es, pm += es)
+        swap(qsort_dtype, pl, pm);
+    if ((r = pb - pa) > es)
+        <%=type_name%>_qsort<%=suffix%>(a, r / es, es);
+    if ((r = pd - pc) > es) {            /* iterate instead of recursing on the right part */
+        a = pn - r;
+        n = r / es;
+        goto loop;
+    }
+}
diff --git a/ext/numo/narray/gen/tmpl/rand.c b/ext/numo/narray/gen/tmpl/rand.c
new file mode 100644
index 0000000..d3ede6b
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/rand.c
@@ -0,0 +1,165 @@
+<%
+if is_int && !is_object
+  if /Int64$/ =~ class_name
+    rand_bit = 64
+  else
+    rand_bit = 32
+  end
+  m_rand = "m_rand(max,shift)"
+  shift_def = "int shift;"
+  shift_set = "shift = #{rand_bit-1} - msb_pos(max);"
+  rand_type = "uint#{rand_bit}_t"
+%>
+
+#define HWID (sizeof(dtype)*4)  /* half the number of bits in dtype, taking 8 bits per byte */
+
+static int msb_pos(<%=rand_type%> a)  /* Bit index of the highest set bit of a within dtype's width, or -1 when a is zero. */
+{
+    int width = HWID;  /* size of the window examined in the current step */
+    int pos = 0;
+    <%=rand_type%> mask = (((<%=rand_type%>)1 << HWID)-1) << HWID;  /* upper half of dtype's bits; built unsigned so no signed shift can overflow */
+
+    if (a==0) {return -1;}
+
+    while (width) {  /* binary search: halve the window on every pass */
+        if (a & mask) {
+            pos += width;  /* a bit is set inside the window: the answer is at least this high */
+        } else {
+            mask >>= width;  /* nothing there: look at the window below */
+        }
+        width >>= 1;
+        mask &= mask << width;  /* keep only the upper half of the current window */
+    }
+    return pos;
+}
+
+/* generates a random number on [0,max) by drawing bits, shifting, and retrying until the draw is below max */
+<% if rand_bit == 64 %>
+inline static dtype m_rand(uint64_t max, int shift)
+{
+    uint64_t x;
+    do {
+        x = gen_rand32();  /* two 32-bit draws are combined into one 64-bit value */
+        x <<= 32;
+        x |= gen_rand32();
+        x >>= shift;  /* shift is supplied by the caller, which derives it from msb_pos(max) */
+    } while (x >= max);  /* reject draws outside [0,max) */
+    return x;
+}
+<% else %>
+inline static dtype m_rand(uint32_t max, int shift)
+{
+    uint32_t x;
+    do {
+        x = gen_rand32();
+        x >>= shift;  /* shift is supplied by the caller, which derives it from msb_pos(max) */
+    } while (x >= max);  /* reject draws outside [0,max) */
+    return x;
+}
+<% end %>
+<%
+else
+  m_rand = "m_rand(max)"
+  shift_def = ""
+  shift_set = ""
+  rand_type = "dtype"
+end
+%>
+
+typedef struct {  /* options handed from the rand method to its loop kernel */
+    dtype low;  /* value added to every generated number */
+    <%=rand_type%> max;  /* bound passed to m_rand */
+} rand_opt_t;
+
+static void  /* Loop kernel: overwrites every element of argument 0 with low plus a fresh m_rand value. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t   i;
+    char    *p1;
+    ssize_t  s1;
+    size_t  *idx1;
+    dtype    x;
+    rand_opt_t *g;
+    dtype    low;
+    <%=rand_type%> max;
+    <%=shift_def%>
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    g = (rand_opt_t*)(lp->opt_ptr);  /* options filled in by the calling method */
+    low = g->low;
+    max = g->max;
+    <%=shift_set%>
+
+    if (idx1) {  /* destination addressed through an index array */
+        for (; i--;) {
+            x = m_add(<%=m_rand%>,low);
+            SET_DATA_INDEX(p1,idx1,dtype,x);
+        }
+    } else {  /* destination addressed with a fixed stride */
+        for (; i--;) {
+            x = m_add(<%=m_rand%>,low);
+            SET_DATA_STRIDE(p1,s1,dtype,x);
+        }
+    }
+}
+
+
+/*
+  Generate uniformly distributed random numbers on self narray.
+  @overload rand([[low],high])
+  @param [Numeric] low  lower inclusive boundary of random numbers. (default=0)
+  @param [Numeric] high  upper exclusive boundary of random numbers. (default=1 or 1+1i for complex types; required for integer types)
+  @return [Numo::<%=class_name%>] self.
+  @example
+    Numo::DFloat.new(6).rand
+    => Numo::DFloat#shape=[6]
+       [0.0617545, 0.373067, 0.794815, 0.201042, 0.116041, 0.344032]
+    Numo::DComplex.new(6).rand(5+5i)
+    => Numo::DComplex#shape=[6]
+       [2.69974+3.68908i, 0.825443+0.254414i, 0.540323+0.34354i, 4.52061+2.39322i, ...]
+    Numo::Int32.new(6).rand(2,5)
+    => Numo::Int32#shape=[6]
+       [4, 3, 3, 2, 4, 2]
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *args, VALUE self)
+{
+    rand_opt_t g;
+    VALUE v1=Qnil, v2=Qnil;
+    dtype high;
+    ndfunc_arg_in_t ain[1] = {{OVERWRITE,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 1,0, ain,0};
+
+    <% if is_int && !is_object %>
+    rb_scan_args(argc, args, "11", &v1, &v2);  /* integer types: one mandatory and one optional argument */
+    if (v2==Qnil) {  /* single argument: it is the upper bound and the range starts at zero */
+        g.low = m_zero;
+        g.max = high = m_num_to_data(v1);
+    <% else %>
+    rb_scan_args(argc, args, "02", &v1, &v2);  /* other types: both arguments optional */
+    if (v2==Qnil) {
+        g.low = m_zero;
+        if (v1==Qnil) {  /* no argument at all: use the default upper bound */
+            <% if is_complex %>
+            g.max = high = c_new(1,1);
+            <% else %>
+            g.max = high = m_one;
+            <% end %>
+        } else {
+            g.max = high = m_num_to_data(v1);
+        }
+    <% end %>
+    } else {  /* two arguments: low and high */
+        g.low = m_num_to_data(v1);
+        high = m_num_to_data(v2);
+        g.max = m_sub(high,g.low);  /* the kernel draws below this width and then adds low */
+    }
+    <% if is_int && !is_object %>
+    if (high <= g.low) {  /* an empty or inverted range is rejected for integer types */
+        rb_raise(rb_eArgError,"high must be larger than low");
+    }
+    <% end %>
+    na_ndloop3(&ndf, &g, 1, self);  /* g reaches the kernel as lp->opt_ptr */
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/rand_norm.c b/ext/numo/narray/gen/tmpl/rand_norm.c
new file mode 100644
index 0000000..da2431c
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/rand_norm.c
@@ -0,0 +1,119 @@
+typedef struct {  /* options handed from the rand_norm method to its loop kernel */
+    dtype mu;  /* first argument of rand_norm (documented as the mean) */
+    rtype sigma;  /* second argument of rand_norm (documented as the standard deviation) */
+} randn_opt_t;
+
+static void  /* Loop kernel: fills argument 0 through m_rand_norm; non-complex types are filled two elements per call. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t   i;
+    char    *p1;
+    ssize_t  s1;
+    size_t  *idx1;
+    <% if is_complex %>
+    dtype   *a0;
+    <% else %>
+    dtype   *a0, *a1;
+    <% end %>
+    dtype    mu;
+    rtype    sigma;
+    randn_opt_t *g;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    g = (randn_opt_t*)(lp->opt_ptr);  /* options filled in by the calling method */
+    mu = g->mu;
+    sigma = g->sigma;
+
+    if (idx1) {  /* destination addressed through an index array */
+        <% if is_complex %>
+        for (; i--;) {
+            a0 = (dtype*)(p1+*idx1);
+            m_rand_norm(mu,sigma,a0);
+            idx1 += 1;
+        }
+        <% else %>
+        for (; i>1; i-=2) {  /* hand two destinations to each m_rand_norm call */
+            a0 = (dtype*)(p1+*idx1);
+            a1 = (dtype*)(p1+*(idx1+1));
+            m_rand_norm(mu,sigma,a0,a1);
+            idx1 += 2;
+        }
+        if (i>0) {  /* odd count: last element, second destination passed as 0 */
+            a0 = (dtype*)(p1+*idx1);
+            m_rand_norm(mu,sigma,a0,0);
+        }
+        <% end %>
+    } else {  /* destination addressed with a fixed stride */
+        <% if is_complex %>
+        for (; i--;) {
+            a0 = (dtype*)(p1);
+            m_rand_norm(mu,sigma,a0);
+            p1 += s1;
+        }
+        <% else %>
+        for (; i>1; i-=2) {  /* hand two destinations to each m_rand_norm call */
+            a0 = (dtype*)(p1);
+            a1 = (dtype*)(p1+s1);
+            m_rand_norm(mu,sigma,a0,a1);
+            p1 += s1*2;
+        }
+        if (i>0) {  /* odd count: last element, second destination passed as 0 */
+            a0 = (dtype*)(p1);
+            m_rand_norm(mu,sigma,a0,0);
+        }
+        <% end %>
+    }
+}
+
+/*
+  Generates random numbers from the normal distribution on self narray
+  using Box-Muller Transformation.
+  @overload rand_norm([mu,[sigma]])
+  @param [Numeric] mu  mean of normal distribution. (default=0)
+  @param [Numeric] sigma  standard deviation of normal distribution. (default=1)
+  @return [Numo::<%=class_name%>] self.
+  @example
+    Numo::DFloat.new(5,5).rand_norm
+    => Numo::DFloat#shape=[5,5]
+       [[-0.581255, -0.168354, 0.586895, -0.595142, -0.802802],
+        [-0.326106, 0.282922, 1.68427, 0.918499, -0.0485384],
+        [-0.464453, -0.992194, 0.413794, -0.60717, -0.699695],
+        [-1.64168, 0.48676, -0.875871, -1.43275, 0.812172],
+        [-0.209975, -0.103612, -0.878617, -1.42495, 1.0968]]
+    Numo::DFloat.new(5,5).rand_norm(10,0.1)
+    => Numo::DFloat#shape=[5,5]
+       [[9.9019, 9.90339, 10.0826, 9.98384, 9.72861],
+        [9.81507, 10.0272, 9.91445, 10.0568, 9.88923],
+        [10.0234, 9.97874, 9.96011, 9.9006, 9.99964],
+        [10.0186, 9.94598, 9.92236, 9.99811, 9.97003],
+        [9.79266, 9.95044, 9.95212, 9.93692, 10.2027]]
+    Numo::DComplex.new(3,3).rand_norm(5+5i)
+    => Numo::DComplex#shape=[3,3]
+       [[5.84303+4.40052i, 4.00984+6.08982i, 5.10979+5.13215i],
+        [4.26477+3.99655i, 4.90052+5.00763i, 4.46607+2.3444i],
+        [4.5528+7.11003i, 5.62117+6.69094i, 5.05443+5.35133i]]
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *args, VALUE self)
+{
+    int n;  /* number of arguments actually supplied */
+    randn_opt_t g;
+    VALUE v1=Qnil, v2=Qnil;
+    ndfunc_arg_in_t ain[1] = {{OVERWRITE,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 1,0, ain,0};
+
+    n = rb_scan_args(argc, args, "02", &v1, &v2);  /* both arguments optional */
+    if (n == 0) {
+        g.mu = m_zero;  /* default mu */
+    } else {
+        g.mu = m_num_to_data(v1);
+    }
+    if (n == 2) {
+        g.sigma = NUM2DBL(v2);
+    } else {
+        g.sigma = 1;  /* default sigma */
+    }
+    na_ndloop3(&ndf, &g, 1, self);  /* g reaches the kernel as lp->opt_ptr */
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/seq.c b/ext/numo/narray/gen/tmpl/seq.c
new file mode 100644
index 0000000..300c321
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/seq.c
@@ -0,0 +1,92 @@
+<% if is_int && !is_object %>
+typedef double seq_data_t;  /* integer element types keep beg/step as double */
+<% else %>
+typedef dtype seq_data_t;
+<% end %>
+
+<% if is_object %>
+typedef size_t seq_count_t;
+<% else %>
+typedef double seq_count_t;
+<% end %>
+
+typedef struct {  /* state shared between the seq method and its loop kernel */
+    seq_data_t beg;  /* first value of the sequence */
+    seq_data_t step;  /* increment between consecutive values */
+    seq_count_t count;  /* running element number; the kernel writes it back after each call */
+} seq_opt_t;
+
+static void  /* Loop kernel: stores f_seq(beg,step,c) into each element of argument 0, incrementing c per element. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1;
+    ssize_t s1;
+    size_t *idx1;
+    dtype   x;
+    seq_data_t beg, step;
+    seq_count_t c;
+    seq_opt_t *g;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    g = (seq_opt_t*)(lp->opt_ptr);  /* state filled in by the calling method */
+    beg  = g->beg;
+    step = g->step;
+    c    = g->count;
+    if (idx1) {  /* destination addressed through an index array */
+        for (; i--;) {
+            x = f_seq(beg,step,c++);
+            *(dtype*)(p1+*idx1) = x;
+            idx1++;
+        }
+    } else {  /* destination addressed with a fixed stride */
+        for (; i--;) {
+            x = f_seq(beg,step,c++);
+            *(dtype*)(p1) = x;
+            p1 += s1;
+        }
+    }
+    g->count = c;  /* saved back; presumably so a later kernel call continues the numbering - confirm */
+}
+
+/*
+  Set linear sequence of numbers to self. The sequence is obtained from
+     beg+i*step
+  where i is 1-dimensional index.
+  @overload seq([beg,[step]])
+  @param [Numeric] beg  beginning of sequence. (default=0)
+  @param [Numeric] step  step of sequence. (default=1)
+  @return [Numo::<%=class_name%>] self.
+  @example
+    Numo::DFloat.new(6).seq(1,-0.2)
+    => Numo::DFloat#shape=[6]
+       [1, 0.8, 0.6, 0.4, 0.2, 0]
+    Numo::DComplex.new(6).seq(1,-0.2+0.2i)
+    => Numo::DComplex#shape=[6]
+       [1+0i, 0.8+0.2i, 0.6+0.4i, 0.4+0.6i, 0.2+0.8i, 0+1i]
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *args, VALUE self)
+{
+    seq_opt_t *g;
+    VALUE vbeg=Qnil, vstep=Qnil;
+    ndfunc_arg_in_t ain[1] = {{OVERWRITE,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 1,0, ain,0};
+
+    g = ALLOCA_N(seq_opt_t,1);
+    g->beg = m_zero;  /* defaults: start at zero, step one, numbering from zero */
+    g->step = m_one;
+    g->count = 0;
+    rb_scan_args(argc, args, "02", &vbeg, &vstep);  /* both arguments optional */
+<% if is_int && !is_object %>
+    if (vbeg!=Qnil) {g->beg = NUM2DBL(vbeg);}
+    if (vstep!=Qnil) {g->step = NUM2DBL(vstep);}
+<% else %>
+    if (vbeg!=Qnil) {g->beg = m_num_to_data(vbeg);}
+    if (vstep!=Qnil) {g->step = m_num_to_data(vstep);}
+<% end %>
+
+    na_ndloop3(&ndf, g, 1, self);  /* g reaches the kernel as lp->opt_ptr */
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/set2.c b/ext/numo/narray/gen/tmpl/set2.c
new file mode 100644
index 0000000..2240979
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/set2.c
@@ -0,0 +1,56 @@
+static void  /* Loop kernel: replaces each element x of argument 0 by the template's m_NAME(x, y), with y read from argument 1. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2;
+    ssize_t s1, s2;
+    size_t *idx1, *idx2;
+    dtype   x;
+    <%=dtype%> y;
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+    if (idx1) {  /* argument 0 addressed through an index array */
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA(p1+*idx1,dtype,x);  /* read in place; the following SET_DATA_INDEX writes the same slot */
+                GET_DATA_INDEX(p2,idx2,<%=dtype%>,y);
+                x = m_<%=name%>(x,y);
+                SET_DATA_INDEX(p1,idx1,dtype,x);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA(p1+*idx1,dtype,x);
+                GET_DATA_STRIDE(p2,s2,<%=dtype%>,y);
+                x = m_<%=name%>(x,y);
+                SET_DATA_INDEX(p1,idx1,dtype,x);
+            }
+        }
+    } else {  /* argument 0 addressed with a fixed stride */
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA(p1,dtype,x);
+                GET_DATA_INDEX(p2,idx2,<%=dtype%>,y);
+                x = m_<%=name%>(x,y);
+                SET_DATA_STRIDE(p1,s1,dtype,x);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA(p1,dtype,x);
+                GET_DATA_STRIDE(p2,s2,<%=dtype%>,y);
+                x = m_<%=name%>(x,y);
+                SET_DATA_STRIDE(p1,s1,dtype,x);
+            }
+        }
+    }
+}
+
+static VALUE  /* Runs the in-place kernel over self with a1 as the second operand and hands a1 back. */
+<%=c_func(1)%>(VALUE self, VALUE a1)
+{
+    ndfunc_arg_in_t in_spec[2] = { {OVERWRITE,0}, {<%=result_class%>,0} };  /* self is overwritten */
+    ndfunc_t loop_def = { <%=c_iter%>, FULL_LOOP, 2, 0, in_spec, 0 };      /* two inputs, no output */
+
+    na_ndloop(&loop_def, 2, self, a1);
+    return a1;
+}
diff --git a/ext/numo/narray/gen/tmpl/sort.c b/ext/numo/narray/gen/tmpl/sort.c
new file mode 100644
index 0000000..b9f580c
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/sort.c
@@ -0,0 +1,47 @@
+<% (is_float ? ["_ignan","_prnan"] : [""]).each do |j| %>
+static void  /* Loop kernel: sorts the n elements of argument 0 in place with the matching qsort variant. */
+<%=c_iter%><%=j%>(na_loop_t *const lp)
+{
+    size_t n;
+    char *ptr;
+    ssize_t step;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR(lp, 0, ptr, step);
+    <%=type_name%>_qsort<%=j%>(ptr, n, step);  /* step is passed as the element spacing */
+}
+<% end %>
+
+/*
+  <%=name%> of self.
+<% if is_float %>
+  @overload <%=name%>(axis:nil, nan:false)
+  @param [TrueClass] nan  If true, propagate NaN. If false, ignore NaN.
+<% else %>
+  @overload <%=name%>(axis:nil)
+<% end %>
+  @param [Numeric,Array,Range] axis  Affected dimensions.
+  @return [Numo::<%=class_name%>] returns result of <%=name%>.
+  @example
+      Numo::DFloat[3,4,1,2].sort => Numo::DFloat[1,2,3,4]
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE reduce;
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_reduce,0}};
+    ndfunc_t ndf = {0, STRIDE_LOOP|NDF_FLAT_REDUCE, 2,0, ain,0};  /* kernel pointer is filled in below */
+
+    if (!TEST_INPLACE(self)) {  /* not flagged in-place: work on a copy so the receiver is untouched */
+        self = na_copy(self);
+    }
+  <% if is_float %>
+    ndf.func = <%=c_iter%>_ignan;  /* the _prnan kernel is handed to na_reduce_dimension as the alternative */
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, <%=c_iter%>_prnan);
+  <% else %>
+    ndf.func = <%=c_iter%>;
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+  <% end %>
+    na_ndloop(&ndf, 2, self, reduce);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/sort_index.c b/ext/numo/narray/gen/tmpl/sort_index.c
new file mode 100644
index 0000000..39dc160
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/sort_index.c
@@ -0,0 +1,102 @@
+<% (is_float ? ["_ignan","_prnan"] : [""]).each do |j|
+   [64,32].each do |i| %>
+#define idx_t int<%=i%>_t
+static void  /* Loop kernel: sorts pointers to the data (argument 0) and emits the matching entries of the index input (argument 1). */
+<%=type_name%>_index<%=i%>_qsort<%=j%>(na_loop_t *const lp)
+{
+    size_t   i, n, idx;
+    char    *d_ptr, *i_ptr, *o_ptr;
+    ssize_t  d_step, i_step, o_step;
+    char   **ptr;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR(lp, 0, d_ptr, d_step);
+    INIT_PTR(lp, 1, i_ptr, i_step);
+    INIT_PTR(lp, 2, o_ptr, o_step);
+
+    ptr = (char**)(lp->opt_ptr);  /* scratch array of element pointers supplied by the caller */
+
+    //printf("(ptr=%lx, d_ptr=%lx,d_step=%ld, i_ptr=%lx,i_step=%ld, o_ptr=%lx,o_step=%ld)\n",(size_t)ptr,(size_t)d_ptr,(ssize_t)d_step,(size_t)i_ptr,(ssize_t)i_step,(size_t)o_ptr,(ssize_t)o_step);
+
+    for (i=0; i<n; i++) {  /* record the address of every data element */
+        ptr[i] = d_ptr + d_step * i;
+        //printf("(%ld,%.3f)",i,*(double*)ptr[i]);
+    }
+
+    <%=type_name%>_index_qsort<%=j%>(ptr, n, sizeof(dtype*));  /* reorders the pointer array, not the data */
+
+    //d_ptr = lp->args[0].ptr;
+    //printf("(d_ptr=%lx)\n",(size_t)d_ptr);
+
+    for (i=0; i<n; i++) {
+        idx = (ptr[i] - d_ptr) / d_step;  /* recover the element's original position from its address */
+        *(idx_t*)o_ptr = *(idx_t*)(i_ptr + i_step * idx);  /* output the index stored for that position */
+        //printf("(idx[%ld]=%ld,%d)",i,idx,*(idx_t*)o_ptr);
+        o_ptr += o_step;
+    }
+    //printf("\n");
+}
+#undef idx_t
+<% end;end %>
+
+/*
+  <%=name%>. Returns an index array of sort result.
+<% if is_float %>
+  @overload <%=name%>(axis:nil, nan:false)
+  @param [TrueClass] nan  If true, propagate NaN. If false, ignore NaN.
+<% else %>
+  @overload <%=name%>(axis:nil)
+<% end %>
+  @param [Numeric,Array,Range] axis  Affected dimensions.
+  @return [Integer,Numo::Int] returns result index of <%=name%>.
+  @example
+      Numo::NArray[3,4,1,2].sort_index => Numo::Int32[2,3,0,1]
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    size_t size;
+    narray_t *na;
+    VALUE idx, tmp, reduce, res;
+    char *buf;
+    ndfunc_arg_in_t ain[3] = {{cT,0},{0,0},{sym_reduce,0}};  /* type of input 1 is chosen below */
+    ndfunc_arg_out_t aout[1] = {{0,0,0}};  /* output type is chosen below */
+    ndfunc_t ndf = {0, STRIDE_LOOP_NIP|NDF_FLAT_REDUCE|NDF_CUM, 3,1, ain,aout};
+
+    GetNArray(self,na);
+    if (na->ndim==0) {  /* zero-dimensional receiver: the only index is 0 */
+        return INT2FIX(0);
+    }
+    if (na->size > (size_t)0x7fffffff) {  /* positions no longer fit a signed 32-bit index: use Int64 */
+        ain[1].type =
+        aout[0].type = numo_cInt64;
+        idx = nary_new(numo_cInt64, na->ndim, na->shape);
+       <% if is_float %>
+         ndf.func = <%=type_name%>_index64_qsort_ignan;
+         reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf,
+                                      <%=type_name%>_index64_qsort_prnan);
+       <% else %>
+         ndf.func = <%=type_name%>_index64_qsort;
+         reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+       <% end %>
+    } else {  /* every position fits in Int32 */
+        ain[1].type =
+        aout[0].type = numo_cInt32;
+        idx = nary_new(numo_cInt32, na->ndim, na->shape);
+       <% if is_float %>
+         ndf.func = <%=type_name%>_index32_qsort_ignan;
+         reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf,
+                                      <%=type_name%>_index32_qsort_prnan);
+       <% else %>
+         ndf.func = <%=type_name%>_index32_qsort;
+         reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+       <% end %>
+    }
+    rb_funcall(idx, rb_intern("seq"), 0);  /* fill idx through its seq method, called without arguments */
+
+    size = na->size*sizeof(void*);  /* one pointer slot per element for the kernel's scratch array */
+    buf = rb_alloc_tmp_buffer(&tmp, size);
+    res = na_ndloop3(&ndf, buf, 3, self, idx, reduce);  /* buf reaches the kernel as lp->opt_ptr */
+    rb_free_tmp_buffer(&tmp);
+    return res;
+}
diff --git a/ext/numo/narray/gen/tmpl/store.c b/ext/numo/narray/gen/tmpl/store.c
new file mode 100644
index 0000000..1cdda16
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/store.c
@@ -0,0 +1,41 @@
+<% children.each do |c|%>
+<%= c.result %>
+
+<% end %>
+/*
+  Store elements to Numo::<%=class_name%> from other.
+  @overload store(other)
+  @param [Object] other
+  @return [Numo::<%=class_name%>] self
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE obj)
+{
+    VALUE r, klass;
+
+    klass = CLASS_OF(obj);
+
+    <% definitions.each do |x| %>
+    if (<%=x.condition("klass")%>) {  /* first matching source class wins */
+        <%=x.c_func%>(self,obj);
+        return self;
+    }
+    <% end %>
+
+    if (IsNArray(obj)) {  /* no dedicated routine matched: ask the narray to convert itself via coerce_cast */
+        r = rb_funcall(obj, rb_intern("coerce_cast"), 1, cT);
+        if (CLASS_OF(r)==cT) {  /* conversion produced this class: store the converted array */
+            <%=c_func%>(self,r);
+            return self;
+        }
+    }
+
+    <% if is_object %>
+    robject_store_numeric(self,obj);
+    <% else %>
+    rb_raise(nary_eCastError, "unknown conversion from %s to %s",
+             rb_class2name(CLASS_OF(obj)),
+             rb_class2name(CLASS_OF(self)));
+    <% end %>
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/store_array.c b/ext/numo/narray/gen/tmpl/store_array.c
new file mode 100644
index 0000000..40cb3b5
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/store_array.c
@@ -0,0 +1,102 @@
+static void  /* Loop kernel: copies values from a Ruby object (argument 1) into argument 0, then zero-fills what is left. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t i, n;
+    size_t i1, n1;
+    VALUE  v1, *ptr;
+    char   *p1;
+    size_t s1, *idx1;
+    VALUE  x;
+    double y;
+    dtype  z;
+    size_t len, c;
+    double beg, step;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    v1 = lp->args[1].value;
+    i = 0;
+
+    if (lp->args[1].ptr) {  /* argument 1 carries a data pointer rather than a plain Ruby value */
+        if (v1 == Qtrue) {
+            iter_<%=type_name%>_store_<%=type_name%>(lp);  /* delegate to the same-type store kernel */
+            i = lp->args[1].shape[0];  /* count those elements as written and step past them */
+            if (idx1) {
+                idx1 += i;
+            } else {
+                p1 += s1 * i;
+            }
+        }
+        goto loop_end;
+    }
+
+    ptr = &v1;  /* default: treat v1 as a one-element list */
+
+    switch(TYPE(v1)) {
+    case T_ARRAY:
+        n1 = RARRAY_LEN(v1);
+        ptr = RARRAY_PTR(v1);
+        break;
+    case T_NIL:
+        n1 = 0;
+        break;
+    default:
+        n1 = 1;
+    }
+
+    if (idx1) {  /* destination addressed through an index array */
+        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
+            x = ptr[i1];
+            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, na_cStep)) {
+                nary_step_sequence(x,&len,&beg,&step);  /* expand into len values beg, beg+step, ... */
+                for (c=0; c<len && i<n; c++,i++) {  /* NOTE(review): i advances here per value and once more in the outer header - confirm intended */
+                    y = beg + step * c;
+                    z = m_from_double(y);
+                    SET_DATA_INDEX(p1, idx1, dtype, z);
+                }
+            }
+            else if (TYPE(x) != T_ARRAY) {  /* nested Arrays are not stored by this branch */
+                z = m_num_to_data(x);
+                SET_DATA_INDEX(p1, idx1, dtype, z);
+            }
+        }
+    } else {  /* destination addressed with a fixed stride */
+        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
+            x = ptr[i1];
+            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, na_cStep)) {
+                nary_step_sequence(x,&len,&beg,&step);  /* expand into len values beg, beg+step, ... */
+                for (c=0; c<len && i<n; c++,i++) {  /* NOTE(review): same double advance of i as in the indexed branch */
+                    y = beg + step * c;
+                    z = m_from_double(y);
+                    SET_DATA_STRIDE(p1, s1, dtype, z);
+                }
+            }
+            else if (TYPE(x) != T_ARRAY) {  /* nested Arrays are not stored by this branch */
+                z = m_num_to_data(x);
+                SET_DATA_STRIDE(p1, s1, dtype, z);
+            }
+        }
+    }
+
+ loop_end:
+    z = m_zero;  /* pad every remaining destination element with zero */
+    if (idx1) {
+        for (; i<n; i++) {
+            SET_DATA_INDEX(p1, idx1, dtype, z);
+        }
+    } else {
+        for (; i<n; i++) {
+            SET_DATA_STRIDE(p1, s1, dtype, z);
+        }
+    }
+}
+
+static VALUE  /* Stores the contents of the Ruby Array rary into self and returns self. */
+<%=c_func%>(VALUE self, VALUE rary)
+{
+    ndfunc_arg_in_t in_spec[2] = { {OVERWRITE,0}, {rb_cArray,0} };   /* self is overwritten from an Array */
+    ndfunc_t loop_def = { <%=c_iter%>, FULL_LOOP, 2, 0, in_spec, 0 }; /* two inputs, no output */
+
+    na_ndloop_store_rarray(&loop_def, self, rary);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/store_bit.c b/ext/numo/narray/gen/tmpl/store_bit.c
new file mode 100644
index 0000000..2422497
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/store_bit.c
@@ -0,0 +1,55 @@
+static void  /* Loop kernel: loads each bit of argument 1 and stores m_from_real(bit) into argument 0. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t     i;
+    char      *p1;
+    size_t     p2;  /* bit position inside a2, not a byte pointer */
+    ssize_t    s1, s2;
+    size_t    *idx1, *idx2;
+    BIT_DIGIT *a2, x;
+    dtype      y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
+    if (idx2) {  /* source bits addressed through an index array */
+        if (idx1) {
+            for (; i--;) {
+                LOAD_BIT(a2, p2+*idx2, x); idx2++;
+                y = m_from_real(x);
+                SET_DATA_INDEX(p1,idx1,dtype,y);
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a2, p2+*idx2, x); idx2++;
+                y = m_from_real(x);
+                SET_DATA_STRIDE(p1,s1,dtype,y);
+            }
+        }
+    } else {  /* source bits addressed with a fixed bit stride */
+        if (idx1) {
+            for (; i--;) {
+                LOAD_BIT(a2, p2, x); p2 += s2;
+                y = m_from_real(x);
+                SET_DATA_INDEX(p1,idx1,dtype,y);
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a2, p2, x); p2 += s2;
+                y = m_from_real(x);
+                SET_DATA_STRIDE(p1,s1,dtype,y);
+            }
+        }
+    }
+}
+
+
+static VALUE  /* Stores the bit array obj into self element by element and returns self. */
+<%=c_func(:nodef)%>(VALUE self, VALUE obj)
+{
+    ndfunc_arg_in_t in_spec[2] = { {OVERWRITE,0}, {Qnil,0} };        /* self is overwritten; obj is taken as-is */
+    ndfunc_t loop_def = { <%=c_iter%>, FULL_LOOP, 2, 0, in_spec, 0 }; /* two inputs, no output */
+
+    na_ndloop(&loop_def, 2, self, obj);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/store_from.c b/ext/numo/narray/gen/tmpl/store_from.c
new file mode 100644
index 0000000..bdbef70
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/store_from.c
@@ -0,0 +1,53 @@
+static void  /* Loop kernel: reads each element of argument 1, converts it with the template's macro, and stores it into argument 0. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i, s1, s2;
+    char   *p1, *p2;
+    size_t *idx1, *idx2;
+    <%=dtype%> x;
+    dtype y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+    if (idx2) {  /* source addressed through an index array */
+        if (idx1) {
+            for (; i--;) {
+                GET_DATA_INDEX(p2,idx2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                SET_DATA_INDEX(p1,idx1,dtype,y);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_INDEX(p2,idx2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                SET_DATA_STRIDE(p1,s1,dtype,y);
+            }
+        }
+    } else {  /* source addressed with a fixed stride */
+        if (idx1) {
+            for (; i--;) {
+                GET_DATA_STRIDE(p2,s2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                SET_DATA_INDEX(p1,idx1,dtype,y);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_STRIDE(p2,s2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                SET_DATA_STRIDE(p1,s1,dtype,y);
+            }
+        }
+    }
+}
+
+
+static VALUE  /* Stores the converted elements of obj into self and returns self. */
+<%=c_func(:nodef)%>(VALUE self, VALUE obj)
+{
+    ndfunc_arg_in_t in_spec[2] = { {OVERWRITE,0}, {Qnil,0} };        /* self is overwritten; obj is taken as-is */
+    ndfunc_t loop_def = { <%=c_iter%>, FULL_LOOP, 2, 0, in_spec, 0 }; /* two inputs, no output */
+
+    na_ndloop(&loop_def, 2, self, obj);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/store_numeric.c b/ext/numo/narray/gen/tmpl/store_numeric.c
new file mode 100644
index 0000000..5f770c3
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/store_numeric.c
@@ -0,0 +1,9 @@
+static VALUE  /* Stores one numeric obj into self by wrapping it in a dimension-0 array first; returns self. */
+<%=c_func(:nodef)%>(VALUE self, VALUE obj)
+{
+    dtype scalar = m_num_to_data(obj);  /* convert the Ruby number to the element type */
+    VALUE boxed;
+    boxed = <%=type_name%>_new_dim0(scalar);
+    <%=parent.c_func%>(self,boxed);     /* hand the wrapper to the parent store routine */
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl/to_a.c b/ext/numo/narray/gen/tmpl/to_a.c
new file mode 100644
index 0000000..6032a6b
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/to_a.c
@@ -0,0 +1,41 @@
+void  /* Loop kernel: converts the elements of argument 0 into a new Ruby Array that is pushed onto lp->args[1].value. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t i, s1;
+    char *p1;
+    size_t *idx1;
+    dtype x;
+    volatile VALUE a, y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    a = rb_ary_new2(i);  /* capacity reserved for all i elements */
+    rb_ary_push(lp->args[1].value, a);  /* attach the new row to the enclosing Array before filling it */
+    if (idx1) {  /* source addressed through an index array */
+        for (; i--;) {
+            GET_DATA_INDEX(p1,idx1,dtype,x);
+            y = m_data_to_num(x);
+            rb_ary_push(a,y);
+        }
+    } else {  /* source addressed with a fixed stride */
+        for (; i--;) {
+            GET_DATA_STRIDE(p1,s1,dtype,x);
+            y = m_data_to_num(x);
+            rb_ary_push(a,y);
+        }
+    }
+}
+
+/*
+  Convert self to a (nested) Ruby Array holding the same elements.
+  @overload <%=name%>
+  @return [Array]
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t in_spec[3] = { {Qnil,0}, {sym_loop_opt}, {sym_option} };
+    ndfunc_arg_out_t out_spec[1] = { {rb_cArray,0} }; // dummy?
+    ndfunc_t loop_def = { <%=c_iter%>, FULL_LOOP_NIP, 3, 1, in_spec, out_spec };
+    return na_ndloop_cast_narray_to_rarray(&loop_def, self, Qnil);
+}
diff --git a/ext/numo/narray/gen/tmpl/unary.c b/ext/numo/narray/gen/tmpl/unary.c
new file mode 100644
index 0000000..382c638
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/unary.c
@@ -0,0 +1,58 @@
+static void  /* Loop kernel: applies the template's m_NAME to each element of argument 0 and stores the result into argument 1. */
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  i;
+    char   *p1, *p2;
+    ssize_t s1, s2;
+    size_t *idx1, *idx2;
+    dtype   x;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+
+    if (idx1) {  /* source addressed through an index array */
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_INDEX(p2,idx2,dtype,x);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_STRIDE(p2,s2,dtype,x);
+            }
+        }
+    } else {  /* source addressed with a fixed stride */
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_INDEX(p2,idx2,dtype,x);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_STRIDE(p2,s2,dtype,x);
+            }
+        }
+    }
+}
+
+/*
+  Unary <%=name%>, applied to every element of self.
+  @overload <%=op_map%>
+  @return [Numo::<%=class_name%>] <%=name%> of self.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t in_spec[1] = { {cT,0} };     /* one input of this class */
+    ndfunc_arg_out_t out_spec[1] = { {cT,0} };   /* one output of this class */
+    ndfunc_t loop_def = { <%=c_iter%>, FULL_LOOP, 1, 1, in_spec, out_spec };
+
+    return na_ndloop(&loop_def, 1, self);
+}
diff --git a/ext/numo/narray/gen/tmpl/unary2.c b/ext/numo/narray/gen/tmpl/unary2.c
new file mode 100644
index 0000000..58c239a
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/unary2.c
@@ -0,0 +1,58 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: like the plain unary kernel, but the result is stored as <%=dtype%> rather than dtype.
+    size_t  i;
+    char   *p1, *p2;
+    ssize_t s1, s2;
+    size_t *idx1, *idx2;
+    dtype   x;                          // input element
+    <%=dtype%> y;                       // output element; its type is supplied by the template
+    INIT_COUNTER(lp, i);                // i: element count, consumed by the i-- loops below
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);  // arg 0 (source)
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);  // arg 1 (destination)
+    if (idx1) {                         // four variants: index table vs. stride on each side
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                y = m_<%=name%>(x);
+                SET_DATA_INDEX(p2,idx2,<%=dtype%>,y);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                y = m_<%=name%>(x);
+                SET_DATA_STRIDE(p2,s2,<%=dtype%>,y);
+            }
+        }
+    } else {
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                y = m_<%=name%>(x);
+                SET_DATA_INDEX(p2,idx2,<%=dtype%>,y);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                y = m_<%=name%>(x);
+                SET_DATA_STRIDE(p2,s2,<%=dtype%>,y);
+            }
+        }
+    }
+}
+
+
+/*
+  <%=name%> of self, computed element by element.
+  @overload <%=name%>
+  @return [Numo::<%=real_class_name%>] <%=name%> of self.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{cT,0}};                      // one input, declared as cT
+    ndfunc_arg_out_t aout[1] = {{<%=result_class%>,0}};     // one output whose class is supplied by the template
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 1, 1, ain, aout };
+
+    return na_ndloop(&ndf, 1, self);
+}
diff --git a/ext/numo/narray/gen/tmpl/unary_ret2.c b/ext/numo/narray/gen/tmpl/unary_ret2.c
new file mode 100644
index 0000000..feba990
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/unary_ret2.c
@@ -0,0 +1,33 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel with one input (arg 0) and two outputs (args 1 and 2); stride addressing only.
+    size_t  i;
+    char   *p1, *p2, *p3;
+    ssize_t s1, s2, s3;
+    dtype   x, y, z;
+    INIT_COUNTER(lp, i);
+    INIT_PTR(lp, 0, p1, s1);        // source
+    INIT_PTR(lp, 1, p2, s2);        // first destination
+    INIT_PTR(lp, 2, p3, s3);        // second destination
+    for (; i--;) {
+        GET_DATA_STRIDE(p1,s1,dtype,x);
+        m_<%=name%>(x,y,z);         // y and z are stored right after this call, so it presumably assigns both
+        SET_DATA_STRIDE(p2,s2,dtype,y);
+        SET_DATA_STRIDE(p3,s3,dtype,z);
+    }
+}
+
+/*
+  <%=name%> of self. NOTE(review): the loop below declares two cT outputs, but only one return value is documented; confirm.
+  @overload <%=name%>
+  @return [Numo::<%=real_class_name%>] <%=name%> of self.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{cT,0}};              // one input
+    ndfunc_arg_out_t aout[2] = {{cT,0},{cT,0}};     // two outputs, both declared as cT
+    ndfunc_t ndf = {<%=c_iter%>, STRIDE_LOOP, 1,2, ain,aout};
+
+    return na_ndloop(&ndf, 1, self);
+}
diff --git a/ext/numo/narray/gen/tmpl/unary_s.c b/ext/numo/narray/gen/tmpl/unary_s.c
new file mode 100644
index 0000000..6ec8707
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl/unary_s.c
@@ -0,0 +1,57 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: stores m_<%=name%>(x) into arg 1 for every element x of arg 0.
+    size_t  i;
+    char   *p1, *p2;
+    ssize_t s1, s2;
+    size_t *idx1, *idx2;
+    dtype   x;
+    INIT_COUNTER(lp, i);                // i: element count, consumed by the i-- loops below
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);  // arg 0 (source)
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);  // arg 1 (destination)
+    if (idx1) {                         // four variants: index table vs. stride on each side
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_INDEX(p2,idx2,dtype,x);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_INDEX(p1,idx1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_STRIDE(p2,s2,dtype,x);
+            }
+        }
+    } else {
+        if (idx2) {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_INDEX(p2,idx2,dtype,x);
+            }
+        } else {
+            for (; i--;) {
+                GET_DATA_STRIDE(p1,s1,dtype,x);
+                x = m_<%=name%>(x);
+                SET_DATA_STRIDE(p2,s2,dtype,x);
+            }
+        }
+    }
+}
+
+/*
+  Calculate <%=name%>(x) for every element of x.
+  @overload <%=name%>(x)
+  @param [Numo::NArray,Numeric] x  input value
+  @return [Numo::<%=class_name%>] result of <%=name%>(x).
+*/
+static VALUE
+<%=c_func(1)%>(VALUE mod, VALUE a1)   // mod (the receiver) is not used in the body
+{
+    ndfunc_arg_in_t ain[1] = {{cT,0}};      // one input, declared as cT
+    ndfunc_arg_out_t aout[1] = {{cT,0}};    // one output, declared as cT
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 1, 1, ain, aout };
+
+    return na_ndloop(&ndf, 1, a1);
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/allocate.c b/ext/numo/narray/gen/tmpl_bit/allocate.c
new file mode 100644
index 0000000..8759328
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/allocate.c
@@ -0,0 +1,24 @@
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{   // Ensures that the bit storage behind self exists; always returns self.
+    narray_t *na;
+    char *ptr;
+
+    GetNArray(self,na);
+
+    switch(NA_TYPE(na)) {
+    case NARRAY_DATA_T:
+        ptr = NA_DATA_PTR(na);
+        if (na->size > 0 && ptr == NULL) {  // allocate only when there are elements and no buffer yet
+            ptr = xmalloc(((na->size-1)/8/sizeof(BIT_DIGIT)+1)*sizeof(BIT_DIGIT)); // bytes for na->size bits, rounded up to whole BIT_DIGITs
+            NA_DATA_PTR(na) = ptr;
+        }
+        break;
+    case NARRAY_VIEW_T:
+        rb_funcall(NA_VIEW_DATA(na), rb_intern("allocate"), 0); // a view delegates to the "allocate" method of its data object
+        break;
+    default:
+        rb_raise(rb_eRuntimeError,"invalid narray type");
+    }
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/aref.c b/ext/numo/narray/gen/tmpl_bit/aref.c
new file mode 100644
index 0000000..adb1d93
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/aref.c
@@ -0,0 +1,55 @@
+/*
+  Array element reference or slice view.
+  @overload [](dim0,...,dimL)
+  @param [Numeric,Range,etc] dim0,...,dimL  Multi-dimensional Index.
+  @return [Numeric,Numo::<%=class_name%>] Element object or NArray view.
+
+  --- Returns the element at +dim0+, +dim1+, ... if they are all Numeric
+  indices for each dimension, or returns an NArray View as a sliced subarray if
+  +dim0+, +dim1+, ... include anything other than a Numeric index, e.g., a Range,
+  an Array or true.
+
+  @example
+      a = Numo::DFloat.new(4,5).seq
+      => Numo::DFloat#shape=[4,5]
+      [[0, 1, 2, 3, 4],
+       [5, 6, 7, 8, 9],
+       [10, 11, 12, 13, 14],
+       [15, 16, 17, 18, 19]]
+
+      a[1,1]
+      => 6.0
+
+      a[1..3,1]
+      => Numo::DFloat#shape=[3]
+      [6, 11, 16]
+
+      a[1,[1,3,4]]
+      => Numo::DFloat#shape=[3]
+      [6, 8, 9]
+
+      a[true,2].fill(99)
+      a
+      => Numo::DFloat#shape=[4,5]
+      [[0, 1, 99, 3, 4],
+       [5, 6, 99, 8, 9],
+       [10, 11, 99, 13, 14],
+       [15, 16, 99, 18, 19]]
+ */
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    int nd;
+    size_t pos;
+    char *ptr;
+    dtype x;
+
+    nd = na_get_result_dimension(self, argc, argv, 1, &pos);   // nd: dimension count of the result; pos is filled in for the scalar case
+    if (nd) {
+        return na_aref_main(argc, argv, self, 0, nd);           // non-zero nd: return what na_aref_main builds
+    } else {
+        ptr = na_get_pointer_for_read(self);
+        LOAD_BIT(ptr,pos,x);                                    // nd == 0: read the single bit at pos
+        return m_data_to_num(x);
+    }
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/aset.c b/ext/numo/narray/gen/tmpl_bit/aset.c
new file mode 100644
index 0000000..9b33c5c
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/aset.c
@@ -0,0 +1,65 @@
+/*
+  Array element(s) set.
+  @overload []=(dim0,..,dimL,val)
+  @param [Numeric,Range,etc] dim0,..,dimL  Multi-dimensional Index.
+  @param [Numeric,Numo::NArray,etc] val  Value(s) to be set to self.
+  @return [Numeric] returns val (last argument).
+
+  --- Replace element(s) at +dim0+, +dim1+, ... (index/range/array/true
+  for each dimension). Broadcasting mechanism is applied.
+
+  @example
+      a = Numo::DFloat.new(3,4).seq
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [4, 5, 6, 7],
+       [8, 9, 10, 11]]
+
+      a[1,2]=99
+      a
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [4, 5, 99, 7],
+       [8, 9, 10, 11]]
+
+      a[1,[0,2]] = [101,102]
+      a
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [101, 5, 102, 7],
+       [8, 9, 10, 11]]
+
+      a[1,true]=99
+      a
+      => Numo::DFloat#shape=[3,4]
+      [[0, 1, 2, 3],
+       [99, 99, 99, 99],
+       [8, 9, 10, 11]]
+  @raise [ArgumentError] if called without any argument (no value to store).
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    int nd;
+    size_t pos;
+    char *ptr;
+    VALUE a;
+    dtype x;
+
+    if (argc < 1) rb_raise(rb_eArgError, "wrong number of arguments (given 0, expected 1+)"); // otherwise argc becomes -1 and argv[-1] is read below
+    argc--;                                 // the last argument is the value; the rest are indices
+    if (argc==0) {
+        <%=c_func.sub(/_aset/,"_store")%>(self, argv[argc]);       // no indices: store into the whole array
+    } else {
+        nd = na_get_result_dimension(self, argc, argv, 1, &pos);
+        if (nd) {
+            a = na_aref_main(argc, argv, self, 0, nd);              // slice: build a view and store into it
+            <%=c_func.sub(/_aset/,"_store")%>(a, argv[argc]);
+        } else {
+            x = <%=type_name%>_extract_data(argv[argc]);            // single element: write one bit at pos
+            ptr = na_get_pointer_for_read_write(self);
+            STORE_BIT(ptr,pos,x);
+        }
+    }
+    return argv[argc];
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/binary.c b/ext/numo/narray/gen/tmpl_bit/binary.c
new file mode 100644
index 0000000..2eccc26
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/binary.c
@@ -0,0 +1,94 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: combines the bits of args 0 and 1 with m_<%=name%> and writes them to arg 2.
+    size_t  n;
+    size_t  p1, p2, p3;
+    ssize_t s1, s2, s3;
+    size_t *idx1, *idx2, *idx3;
+    int     o1, o2, l1, l2, r1, r2, len;
+    BIT_DIGIT *a1, *a2, *a3;
+    BIT_DIGIT  x, y;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
+    INIT_PTR_BIT_IDX(lp, 2, a3, p3, s3, idx3);
+    if (s1!=1 || s2!=1 || s3!=1 || idx1 || idx2 || idx3) {     // general case: one bit per iteration
+        for (; n--;) {
+            LOAD_BIT_STEP(a1, p1, s1, idx1, x);
+            LOAD_BIT_STEP(a2, p2, s2, idx2, y);
+            x = m_<%=name%>(x,y);
+            STORE_BIT_STEP(a3, p3, s3, idx3, x);
+        }
+    } else {                                                   // all steps are 1 and no index tables: work on whole BIT_DIGITs
+        o1 =  p1 % NB;                                         // o1/o2: bit offset of each source relative to the destination offset p3
+        o1 -= p3;
+        o2 =  p2 % NB;
+        o2 -= p3;
+        l1 =  NB+o1;                                           // l*/r*: shift amounts for pulling bits in from the neighbouring digit
+        r1 =  NB-o1;
+        l2 =  NB+o2;
+        r2 =  NB-o2;
+        if (p3>0 || n<NB) {                                    // leading partial digit of the destination
+            len = NB - p3;
+            if ((int)n<len) len=n;
+            if (o1>=0) x = *a1>>o1;
+            else       x = *a1<<-o1;
+            if (p1+len>NB)  x |= *(a1+1)<<r1;
+            a1++;
+            if (o2>=0) y = *a2>>o2;
+            else       y = *a2<<-o2;
+            if (p2+len>NB)  y |= *(a2+1)<<r2;
+            a2++;
+            x = m_<%=name%>(x,y);
+            *a3 = (x & (SLB(len)<<p3)) | (*a3 & ~(SLB(len)<<p3));   // replace only the masked bits; keep the rest of *a3
+            a3++;
+            n -= len;
+        }
+        if (o1==0 && o2==0) {                                  // sources aligned with the destination: plain digit-wise loop
+            for (; n>=NB; n-=NB) {
+                x = *(a1++);
+                y = *(a2++);
+                x = m_<%=name%>(x,y);
+                *(a3++) = x;
+            }
+        } else {                                               // misaligned: assemble each operand from two adjacent digits
+            for (; n>=NB; n-=NB) {
+                x = *a1>>o1;
+                if (o1<0)  x |= *(a1-1)>>l1;
+                if (o1>0)  x |= *(a1+1)<<r1;
+                a1++;
+                y = *a2>>o2;
+                if (o2<0)  y |= *(a2-1)>>l2;
+                if (o2>0)  y |= *(a2+1)<<r2;
+                a2++;
+                x = m_<%=name%>(x,y);
+                *(a3++) = x;
+            }
+        }
+        if (n>0) {                                             // trailing partial digit: fewer than NB bits remain
+            x = *a1>>o1;
+            if (o1<0)  x |= *(a1-1)>>l1;
+            y = *a2>>o2;
+            if (o2<0)  y |= *(a2-1)>>l2;
+            x = m_<%=name%>(x,y);
+            *a3 = (x & SLB(n)) | (*a3 & BALL<<n);              // take the bits selected by SLB(n) from x; keep the BALL<<n part of *a3
+        }
+    }
+}
+
+/*
+  Binary <%=name%>: combines self and other element by element.
+  @overload <%=op_map%> other
+  @param [Numo::NArray,Numeric] other
+  @return [Numo::NArray] <%=name%> of self and other.
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE other)
+{
+    ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};   // two inputs, both declared as cT
+    ndfunc_arg_out_t aout[1] = {{cT,0}};        // one output, declared as cT
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 2, 1, ain, aout };
+
+    return na_ndloop(&ndf, 2, self, other);
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/bit_count.c b/ext/numo/narray/gen/tmpl_bit/bit_count.c
new file mode 100644
index 0000000..48ab41e
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/bit_count.c
@@ -0,0 +1,85 @@
+#undef int_t
+#define int_t int64_t
+
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: adds 1 to the int_t counter in arg 1 for every bit of arg 0 for which m_<%=name%> is true.
+    size_t  i;
+    BIT_DIGIT *a1;
+    size_t  p1;
+    char   *p2;
+    ssize_t s1, s2;
+    size_t *idx1;
+    BIT_DIGIT x=0;
+    int_t   y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    INIT_PTR(lp, 1, p2, s2);
+    if (s2==0) {                        // output step is zero: one counter receives the whole run
+        GET_DATA(p2, int_t, y);         // accumulate in the local y and write it back once at the end
+        if (idx1) {
+            for (; i--;) {
+                LOAD_BIT(a1, p1+*idx1, x);
+                idx1++;
+                if (m_<%=name%>(x)) {
+                    y++;
+                }
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a1, p1, x);
+                p1 += s1;
+                if (m_<%=name%>(x)) {
+                    y++;
+                }
+            }
+        }
+        *(int_t*)p2 = y;
+    } else {                            // output advances by s2 per element: update each counter in place
+        if (idx1) {
+            for (; i--;) {
+                LOAD_BIT(a1, p1+*idx1, x);
+                idx1++;
+                if (m_<%=name%>(x)) {
+                    GET_DATA(p2, int_t, y);
+                    y++;
+                    SET_DATA(p2, int_t, y);
+                }
+                p2+=s2;
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a1, p1, x);
+                p1+=s1;
+                if (m_<%=name%>(x)) {
+                    GET_DATA(p2, int_t, y);
+                    y++;
+                    SET_DATA(p2, int_t, y);
+                }
+                p2+=s2;
+            }
+        }
+    }
+}
+
+/*
+  Returns the number of bits for which <%=name%> holds.
+  If argument is supplied, return Int-array counted along the axes.
+  @overload <%=op_map%>(axis:nil, keepdims:false)
+  @param [Integer,Array,Range] axis (keyword) axes to be counted.
+  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
+  @return [Numo::Int64]
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE v, reduce;
+    ndfunc_arg_in_t ain[3] = {{cT,0},{sym_reduce,0},{sym_init,0}};
+    ndfunc_arg_out_t aout[1] = {{numo_cInt64,0}};       // counters are declared as Int64
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP_NIP, 3, 1, ain, aout };
+
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+    v = na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));   // INT2FIX(0) is passed for the sym_init slot
+    return rb_funcall(v,rb_intern("extract"),0);        // call the "extract" method on the loop result
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/bit_reduce.c b/ext/numo/narray/gen/tmpl_bit/bit_reduce.c
new file mode 100644
index 0000000..2b1d1df
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/bit_reduce.c
@@ -0,0 +1,129 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: a destination bit that still equals <%=init_bit%> is overwritten by a source bit that differs from it.
+    size_t     i;
+    BIT_DIGIT *a1, *a2;
+    size_t     p1,  p2;
+    ssize_t    s1,  s2;
+    size_t    *idx1, *idx2;
+    BIT_DIGIT  x=0, y=0;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);  // arg 0: source bits
+    INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);  // arg 1: destination bits
+    if (idx2) {                                 // destination addressed through an index table
+        if (idx1) {
+            for (; i--;) {
+                LOAD_BIT(a2, p2+*idx2, y);
+                if (y == <%=init_bit%>) {
+                    LOAD_BIT(a1, p1+*idx1, x);
+                    if (x != <%=init_bit%>) {
+                        STORE_BIT(a2, p2+*idx2, x);
+                    }
+                }
+                idx1++;
+                idx2++;
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a2, p2+*idx2, y);
+                if (y == <%=init_bit%>) {
+                    LOAD_BIT(a1, p1, x);
+                    if (x != <%=init_bit%>) {
+                        STORE_BIT(a2, p2+*idx2, x);
+                    }
+                }
+                p1 += s1;
+                idx2++;
+            }
+        }
+    } else if (s2) {                            // destination advances by a non-zero step
+        if (idx1) {
+            for (; i--;) {
+                LOAD_BIT(a2, p2, y);
+                if (y == <%=init_bit%>) {
+                    LOAD_BIT(a1, p1+*idx1, x);
+                    if (x != <%=init_bit%>) {
+                        STORE_BIT(a2, p2, x);
+                    }
+                }
+                idx1++;
+                p2 += s2;
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a2, p2, y);
+                if (y == <%=init_bit%>) {
+                    LOAD_BIT(a1, p1, x);
+                    if (x != <%=init_bit%>) {
+                        STORE_BIT(a2, p2, x);
+                    }
+                }
+                p1 += s1;
+                p2 += s2;
+            }
+        }
+    } else {                                    // single destination bit (s2 == 0); note x is the destination and y the source here
+        LOAD_BIT(a2, p2, x);
+        if (x != <%=init_bit%>) {
+            return;                             // destination already differs from the initial value: nothing to do
+        }
+        if (idx1) {
+            for (; i--;) {
+                LOAD_BIT(a1, p1+*idx1, y);
+                if (y != <%=init_bit%>) {
+                    STORE_BIT(a2, p2, y);
+                    return;                     // stop at the first source bit that differs
+                }
+                idx1++;
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a1, p1, y);
+                if (y != <%=init_bit%>) {
+                    STORE_BIT(a2, p2, y);
+                    return;                     // stop at the first source bit that differs
+                }
+                p1 += s1;
+            }
+        }
+    }
+}
+
+/*
+<% case name
+   when /^any/ %>
+  Return true if any of the bits is one (true).
+<% when /^all/ %>
+  Return true if all of the bits are one (true).
+<% end %>
+  If argument is supplied, return Bit-array reduced along the axes.
+  @overload <%=op_map%>(axis:nil, keepdims:false)
+  @param [Integer,Array,Range] axis (keyword) axes to be reduced.
+  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
+  @return [Numo::Bit] .
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE v, reduce;
+    ndfunc_arg_in_t ain[3] = {{cT,0},{sym_reduce,0},{sym_init,0}};
+    ndfunc_arg_out_t aout[1] = {{numo_cBit,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 3,1, ain,aout};
+
+    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+    v = na_ndloop(&ndf, 3, self, reduce, INT2FIX(<%=init_bit%>)); // <%=init_bit%> is passed for the sym_init slot
+    if (argc > 0) {
+        return v;                       // any argument given: return the loop result as is
+    }
+    v = <%=find_tmpl("extract").c_func%>(v);
+    switch (v) {                        // no argument: map the extracted 0/1 to false/true
+    case INT2FIX(0):
+        return Qfalse;
+    case INT2FIX(1):
+        return Qtrue;
+    default:
+        rb_bug("unexpected result");    // extract returned something other than 0 or 1
+        return v;
+    }
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/each.c b/ext/numo/narray/gen/tmpl_bit/each.c
new file mode 100644
index 0000000..8825164
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/each.c
@@ -0,0 +1,44 @@
+void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: yields every bit of arg 0 to the block. NOTE(review): declared without static, unlike most kernels here; confirm it is intentional.
+    size_t   i;
+    BIT_DIGIT *a1, x=0;
+    size_t   p1;
+    ssize_t  s1;
+    size_t  *idx1;
+    VALUE  y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    if (idx1) {                         // bit positions come from an index table
+        for (; i--;) {
+            LOAD_BIT(a1, p1+*idx1, x); idx1++;
+            y = m_data_to_num(x);
+            rb_yield(y);
+        }
+    } else {                            // bit positions advance by the step s1
+        for (; i--;) {
+            LOAD_BIT(a1, p1, x); p1+=s1;
+            y = m_data_to_num(x);
+            rb_yield(y);
+        }
+    }
+}
+
+/*
+  Calls the given block once for each element in self,
+  passing that element as a parameter.
+  @overload <%=name%>
+  @return [Numo::NArray] self
+  For a block {|x| ... }
+  @yield [x]  x is an element of NArray.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{Qnil,0}};                        // one input; no class is named (Qnil)
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0};    // no outputs: the kernel only yields
+
+    na_ndloop(&ndf, 1, self);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/each_with_index.c b/ext/numo/narray/gen/tmpl_bit/each_with_index.c
new file mode 100644
index 0000000..3675382
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/each_with_index.c
@@ -0,0 +1,66 @@
+static inline void
+yield_each_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
+{   // Yields a new Ruby array of md items taken from a: the element x followed by the counters c[0..nd].
+    int j;
+
+    a[0] = m_data_to_num(x);
+    for (j=0; j<=nd; j++) {             // inclusive bound: writes a[1] .. a[nd+1]
+        a[j+1] = SIZET2NUM(c[j]);
+    }
+    rb_yield(rb_ary_new4(md,a));
+}
+
+
+void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: yields every bit of arg 0 together with the index counters held in lp->opt_ptr.
+    size_t   i;
+    BIT_DIGIT *a1, x=0;
+    size_t   p1;
+    ssize_t  s1;
+    size_t  *idx1;
+
+    VALUE *a;
+    size_t *c;
+    int nd, md;
+
+    c = (size_t*)(lp->opt_ptr);         // counter array supplied through opt_ptr
+    nd = lp->ndim - 1;                  // last counter slot, advanced by this kernel
+    md = lp->ndim + 1;                  // yielded items: one element plus lp->ndim counters
+    a = ALLOCA_N(VALUE,md);             // scratch buffer for the yielded items
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    c[nd] = 0;                          // restart the last counter for this run
+    if (idx1) {
+        for (; i--;) {
+            LOAD_BIT(a1, p1+*idx1, x); idx1++;
+            yield_each_with_index(x,c,a,nd,md);
+            c[nd]++;
+        }
+    } else {
+        for (; i--;) {
+            LOAD_BIT(a1, p1, x); p1+=s1;
+            yield_each_with_index(x,c,a,nd,md);
+            c[nd]++;
+        }
+    }
+}
+
+/*
+  Invokes the given block once for each element of self,
+  passing that element and indices along each axis as parameters.
+  @overload <%=name%>
+  @return [Numo::NArray] self
+  For a block {|x,i,j,...| ... }
+  @yield [x,i,j,...]  x is an element, i,j,... are multidimensional indices.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[1] = {{Qnil,0}};                        // one input; no class is named (Qnil)
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0};    // no outputs: the kernel only yields
+
+    na_ndloop_with_index(&ndf, 1, self);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/extract.c b/ext/numo/narray/gen/tmpl_bit/extract.c
new file mode 100644
index 0000000..75ea589
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/extract.c
@@ -0,0 +1,25 @@
+/*
+  Extract an element only if self is a dimensionless NArray.
+  @overload extract
+  @return [Numeric,Numo::NArray]
+  --- Extract element value as Ruby Object if self is a dimensionless NArray,
+  otherwise returns self.
+*/
+
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    BIT_DIGIT *ptr, val;
+    size_t pos;
+    narray_t *na;
+    GetNArray(self,na);
+
+    if (na->ndim==0) {                  // dimensionless: return the single bit as a Fixnum
+        pos = na_get_offset(self);
+        ptr = (BIT_DIGIT*)na_get_pointer_for_read(self);
+        val = ((*((ptr)+(pos)/NB)) >> ((pos)%NB)) & 1u;    // digit pos/NB, bit pos%NB within that digit
+        na_release_lock(self);
+        return INT2FIX(val);
+    }
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/fill.c b/ext/numo/narray/gen/tmpl_bit/fill.c
new file mode 100644
index 0000000..d7b04ab
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/fill.c
@@ -0,0 +1,65 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: sets every addressed bit of arg 0 to the value given in lp->option.
+    size_t  n;
+    size_t  p3;
+    ssize_t s3;
+    size_t *idx3;
+    int     len;
+    BIT_DIGIT *a3;
+    BIT_DIGIT  y;
+    VALUE x = lp->option;
+
+    if (x==INT2FIX(0) || x==Qfalse) {   // accepted fill values: 0/false and 1/true
+        y = 0;
+    } else
+    if (x==INT2FIX(1) || x==Qtrue) {
+        y = ~(BIT_DIGIT)0;              // all bits set, so whole digits can be written below
+    } else {
+        rb_raise(rb_eArgError, "invalid value for Bit");
+    }
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a3, p3, s3, idx3);
+    if (idx3) {                         // index table: store bit by bit
+        y = y&1;
+        for (; n--;) {
+            STORE_BIT(a3, p3+*idx3, y); idx3++;
+        }
+    } else if (s3!=1) {                 // step other than 1: store bit by bit
+        y = y&1;
+        for (; n--;) {
+            STORE_BIT(a3, p3, y); p3+=s3;
+        }
+    } else {                            // step 1, no index table: write whole BIT_DIGITs where possible
+        if (p3>0 || n<NB) {             // leading partial digit
+            len = NB - p3;
+            if ((int)n<len) len=n;
+            *a3 = (y & (SLB(len)<<p3)) | (*a3 & ~(SLB(len)<<p3));  // replace only the masked bits
+            a3++;
+            n -= len;
+        }
+        for (; n>=NB; n-=NB) {          // full digits
+            *(a3++) = y;
+        }
+        if (n>0) {                      // trailing partial digit
+            *a3 = (y & SLB(n)) | (*a3 & BALL<<n);
+        }
+    }
+}
+
+/*
+  Fill elements with other (0, 1, false or true; any other value raises ArgumentError in the loop).
+  @overload <%=name%> other
+  @param [Numeric] other
+  @return [Numo::<%=class_name%>] self.
+*/
+static VALUE
+<%=c_func(1)%>(VALUE self, VALUE val)
+{
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_option}};     // arg 0 is marked OVERWRITE; val is passed in the sym_option slot
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 2,0, ain,0};       // no outputs
+
+    na_ndloop(&ndf, 2, self, val);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/format.c b/ext/numo/narray/gen/tmpl_bit/format.c
new file mode 100644
index 0000000..8770f03
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/format.c
@@ -0,0 +1,61 @@
+static VALUE
+format_<%=type_name%>(VALUE fmt, dtype x)
+{   // Returns x as a Ruby String: via m_sprintf when fmt is nil, otherwise via fmt % x.
+    if (NIL_P(fmt)) {
+        char s[4];                      // NOTE(review): assumes m_sprintf writes at most 4 bytes including the terminator; confirm
+        int n;
+        n = m_sprintf(s,x);
+        return rb_str_new(s,n);
+    }
+    return rb_funcall(fmt, '%', 1, m_data_to_num(x));   // the character '%' is used as the method ID
+}
+
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: formats every bit of arg 0 and stores the resulting VALUE into arg 1.
+    size_t  i;
+    BIT_DIGIT *a1, x=0;
+    size_t     p1;
+    char      *p2;
+    ssize_t    s1, s2;
+    size_t    *idx1;
+    VALUE  y;
+    VALUE  fmt = lp->option;            // format object passed by the wrapper (may be nil)
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    INIT_PTR(lp, 1, p2, s2);
+
+    if (idx1) {                         // bit positions come from an index table
+        for (; i--;) {
+            LOAD_BIT(a1, p1+*idx1, x); idx1++;
+            y = format_<%=type_name%>(fmt, x);
+            SET_DATA_STRIDE(p2, s2, VALUE, y);
+        }
+    } else {                            // bit positions advance by the step s1
+        for (; i--;) {
+            LOAD_BIT(a1, p1, x); p1+=s1;
+            y = format_<%=type_name%>(fmt, x);
+            SET_DATA_STRIDE(p2, s2, VALUE, y);
+        }
+    }
+}
+
+/*
+  Format elements into strings.
+  @overload <%=name%> format
+  @param [String] format  optional; nil when omitted.
+  @return [Numo::RObject] array of formatted strings.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    VALUE fmt=Qnil;
+
+    ndfunc_arg_in_t ain[2] = {{Qnil,0},{sym_option}};       // fmt is passed in the sym_option slot
+    ndfunc_arg_out_t aout[1] = {{numo_cRObject,0}};         // output declared as RObject
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 2,1, ain,aout};
+
+    rb_scan_args(argc, argv, "01", &fmt);                   // zero or one optional argument
+    return na_ndloop(&ndf, 2, self, fmt);
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/format_to_a.c b/ext/numo/narray/gen/tmpl_bit/format_to_a.c
new file mode 100644
index 0000000..d0dfd54
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/format_to_a.c
@@ -0,0 +1,48 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: formats every bit of arg 0 and collects the strings in a new Ruby Array pushed onto lp->args[1].value.
+    size_t  i;
+    BIT_DIGIT *a1, x=0;
+    size_t     p1;
+    ssize_t    s1;
+    size_t   *idx1;
+    VALUE y;
+    VALUE fmt = lp->option;             // format object passed by the wrapper (may be nil)
+    volatile VALUE a;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    a = rb_ary_new2(i);                 // one result array per run, with capacity i
+    rb_ary_push(lp->args[1].value, a);
+    if (idx1) {                         // bit positions come from an index table
+        for (; i--;) {
+            LOAD_BIT(a1, p1+*idx1, x); idx1++;
+            y = format_<%=type_name%>(fmt, x);  // templated name, matching the format and inspect templates
+            rb_ary_push(a,y);
+        }
+    } else {                            // bit positions advance by the step s1
+        for (; i--;) {
+            LOAD_BIT(a1, p1, x); p1+=s1;
+            y = format_<%=type_name%>(fmt, x);
+            rb_ary_push(a,y);
+        }
+    }
+}
+
+/*
+  Format elements into strings.
+  @overload <%=name%> format
+  @param [String] format  optional; nil when omitted.
+  @return [Array] array of formatted strings.
+*/
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{
+    volatile VALUE fmt=Qnil;
+    ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
+    ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 3,1, ain,aout};
+
+    rb_scan_args(argc, argv, "01", &fmt);       // zero or one optional argument
+    return na_ndloop_cast_narray_to_rarray(&ndf, self, fmt);
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/inspect.c b/ext/numo/narray/gen/tmpl_bit/inspect.c
new file mode 100644
index 0000000..cb5cca0
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/inspect.c
@@ -0,0 +1,18 @@
+static VALUE
+<%=c_iter%>(char *ptr, size_t pos, VALUE fmt)
+{   // Callback passed to na_ndloop_inspect: formats the single bit at position pos of ptr.
+    dtype x;
+    LOAD_BIT(ptr,pos,x);
+    return format_<%=type_name%>(fmt, x);
+}
+
+/*
+  Returns a string containing a human-readable representation of NArray.
+  @overload inspect
+  @return [String]
+*/
+VALUE
+<%=c_func(0)%>(VALUE ary)
+{
+    return na_ndloop_inspect(ary, <%=c_iter%>, Qnil);   // Qnil is passed as the third argument; presumably the fmt handed to the callback
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/mask.c b/ext/numo/narray/gen/tmpl_bit/mask.c
new file mode 100644
index 0000000..5a64787
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/mask.c
@@ -0,0 +1,132 @@
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{   // Loop kernel: for every set bit of the mask (arg 0), appends the matching position of arg 1 to the buffer in lp->opt_ptr.
+    size_t  i;
+    BIT_DIGIT *a;
+    size_t  p1, p2;
+    ssize_t s1, s2;
+    size_t *idx1, *idx2, *pidx;
+    BIT_DIGIT x=0;
+    size_t  count;
+    where_opt_t *g;
+
+    g = (where_opt_t*)(lp->opt_ptr);
+    count = g->count;                   // resume from the state left by the previous call
+    pidx  = (size_t*)(g->idx1);
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a, p1, s1, idx1);
+    //INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+    p2 = lp->args[1].iter[0].pos;       // only positions of arg 1 are needed, not its data pointer
+    s2 = lp->args[1].iter[0].step;
+    idx2 = lp->args[1].iter[0].idx;
+
+    if (idx1) {                         // four variants: index table vs. step on each side
+        if (idx2) {
+            for (; i--;) {
+                LOAD_BIT(a, p1+*idx1, x);
+                idx1++;
+                if (x) {
+                    *(pidx++) = p2+*idx2;
+                    count++;
+                }
+                idx2++;
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a, p1+*idx1, x);
+                idx1++;
+                if (x) {
+                    *(pidx++) = p2;
+                    count++;
+                }
+                p2 += s2;
+            }
+        }
+    } else {
+        if (idx2) {
+            for (; i--;) {
+                LOAD_BIT(a, p1, x);
+                p1 += s1;
+                if (x) {
+                    *(pidx++) = p2+*idx2;
+                    count++;
+                }
+                idx2++;
+            }
+        } else {
+            for (; i--;) {
+                LOAD_BIT(a, p1, x);
+                p1 += s1;
+                if (x) {
+                    *(pidx++) = p2;
+                    count++;
+                }
+                p2 += s2;
+            }
+        }
+    }
+    g->count = count;                   // save the state so the next call keeps appending
+    g->idx1  = (char*)pidx;
+}
+
+#if   SIZEOF_VOIDP == 8
+#define cIndex numo_cInt64
+#elif SIZEOF_VOIDP == 4
+#define cIndex numo_cInt32
+#endif
+
+/*
+  Return subarray of argument masked with self bit array.
+  @overload <%=op_map%>(array)
+  @param [Numo::NArray] array  narray to be masked.
+  @return [Numo::NArray]  one-dimensional view of the masked array.
+*/
+static VALUE
+<%=c_func(1)%>(VALUE mask, VALUE val)
+{
+    volatile VALUE idx_1, view;
+    narray_data_t *nidx;
+    narray_view_t *nv;
+    narray_t      *na;
+    narray_view_t *na1;
+    stridx_t stridx0;
+    size_t n_1;
+    where_opt_t g;
+    ndfunc_arg_in_t ain[2] = {{cT,0},{Qnil,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 2, 0, ain, 0};
+
+    n_1 = NUM2SIZET(<%=find_tmpl("count_true").c_func%>(0, NULL, mask));   // result of count_true on the mask = result length
+    idx_1 = nary_new(cIndex, 1, &n_1);              // 1-D cIndex array used as the position buffer
+    g.count = 0;
+    g.elmsz = SIZEOF_VOIDP;
+    g.idx1 = na_get_pointer_for_write(idx_1);
+    g.idx0 = NULL;
+    na_ndloop3(&ndf, &g, 2, mask, val);             // the kernel fills the buffer through g
+
+    view = na_s_allocate_view(CLASS_OF(val));       // the result has the same class as val
+    GetNArrayView(view, nv);
+    na_setup_shape((narray_t*)nv, 1, &n_1);
+
+    GetNArrayData(idx_1,nidx);
+    SDX_SET_INDEX(stridx0,(size_t*)nidx->ptr);      // hand the position buffer over to the view's index
+    nidx->ptr = NULL;                               // idx_1 no longer references the buffer (presumably to avoid a double free; confirm)
+
+    nv->stridx = ALLOC_N(stridx_t,1);
+    nv->stridx[0] = stridx0;
+    nv->offset = 0;
+
+    GetNArray(val, na);
+    switch(NA_TYPE(na)) {
+    case NARRAY_DATA_T:
+        nv->data = val;                             // data array: reference it directly
+        break;
+    case NARRAY_VIEW_T:
+        GetNArrayView(val, na1);
+        nv->data = na1->data;                       // view: reference the data object behind it
+        break;
+    default:
+        rb_raise(rb_eRuntimeError,"invalid NA_TYPE: %d",NA_TYPE(na));
+    }
+
+    return view;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/none_p.c b/ext/numo/narray/gen/tmpl_bit/none_p.c
new file mode 100644
index 0000000..31e8213
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/none_p.c
@@ -0,0 +1,14 @@
+static VALUE
+<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
+{   // Negation of any?: forwards all arguments to any? and inverts the result.
+    VALUE v;
+
+    v = <%=find_tmpl("any?").c_func%>(argc,argv,self);
+
+    if (v==Qtrue) {                     // boolean result: flip it
+        return Qfalse;
+    } else if (v==Qfalse) {
+        return Qtrue;
+    }
+    return <%=find_tmpl("not").c_func%>(v);    // any other result (e.g. a reduced array): apply the "not" template function
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/store_array.c b/ext/numo/narray/gen/tmpl_bit/store_array.c
new file mode 100644
index 0000000..23002da
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/store_array.c
@@ -0,0 +1,104 @@
+/*
+  Iterator storing the value of argument 1 into the bit array of
+  argument 0.
+
+  - If argument 1 carries a data pointer and its value is Qtrue, the
+    same-type store iterator is called and the positions it covered
+    (shape[0] of argument 1) are skipped.
+  - Otherwise the value is treated as a Ruby Array (elements taken one by
+    one), as nil (no elements) or as a single scalar.  Range and na_cStep
+    elements are expanded through nary_step_sequence.
+  - Whatever part of the n destination positions is left afterwards is
+    filled with m_zero.
+*/
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t i, n;
+    size_t i1, n1;
+    VALUE  v1, *ptr;
+    BIT_DIGIT *a1;
+    size_t p1;
+    size_t s1, *idx1;
+    VALUE  x;
+    double y;
+    BIT_DIGIT z;
+    size_t len, c;
+    double beg, step;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    v1 = lp->args[1].value;
+    i = 0;
+
+    if (lp->args[1].ptr) {
+        if (v1 == Qtrue) {
+            iter_<%=type_name%>_store_<%=type_name%>(lp);
+            /* advance past the positions handled by the call above */
+            i = lp->args[1].shape[0];
+            if (idx1) {
+                idx1 += i;
+            } else {
+                p1 += s1 * i;
+            }
+        }
+        goto loop_end;
+    }
+
+    /* default: treat v1 as a one-element sequence */
+    ptr = &v1;
+
+    switch(TYPE(v1)) {
+    case T_ARRAY:
+        n1 = RARRAY_LEN(v1);
+        ptr = RARRAY_PTR(v1);
+        break;
+    case T_NIL:
+        n1 = 0;
+        break;
+    default:
+        n1 = 1;
+    }
+
+    /* NOTE(review): in both loops below the Range/Step branch is followed
+       by a plain "if", not "else if", so after a Range has been expanded
+       the Range object itself is also passed to m_num_to_data and stored,
+       without checking i<n.  The outer loop also increments i once more
+       after the inner loop has already counted every expanded element.
+       Please confirm whether this is intended. */
+    if (idx1) {
+        /* destination addressed through an index list */
+        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
+            x = ptr[i1];
+            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, na_cStep)) {
+                nary_step_sequence(x,&len,&beg,&step);
+                for (c=0; c<len && i<n; c++,i++) {
+                    y = beg + step * c;
+                    z = m_from_double(y);
+                    STORE_BIT(a1, p1+*idx1, z); idx1++;
+                }
+            }
+            if (TYPE(x) != T_ARRAY) {
+                if (x == Qnil) x = INT2FIX(0);
+                z = m_num_to_data(x);
+                STORE_BIT(a1, p1+*idx1, z); idx1++;
+            }
+        }
+    } else {
+        /* destination addressed by a constant bit stride */
+        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
+            x = ptr[i1];
+            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, na_cStep)) {
+                nary_step_sequence(x,&len,&beg,&step);
+                for (c=0; c<len && i<n; c++,i++) {
+                    y = beg + step * c;
+                    z = m_from_double(y);
+                    STORE_BIT(a1, p1, z); p1+=s1;
+                }
+            }
+            /* NOTE(review): unlike the indexed loop above, nil is not
+               replaced by INT2FIX(0) here before the conversion. */
+            if (TYPE(x) != T_ARRAY) {
+                z = m_num_to_data(x);
+                STORE_BIT(a1, p1, z); p1+=s1;
+            }
+        }
+    }
+
+ loop_end:
+    /* clear the destination positions that received no value */
+    z = m_zero;
+    if (idx1) {
+        for (; i<n; i++) {
+            STORE_BIT(a1, p1+*idx1, z); idx1++;
+        }
+    } else {
+        for (; i<n; i++) {
+            STORE_BIT(a1, p1, z); p1+=s1;
+        }
+    }
+}
+
+/*
+  Store a Ruby Array into self.
+  Runs the iterator above through na_ndloop_store_rarray with self as the
+  overwritten argument and rary as the Array argument, then returns self.
+*/
+static VALUE
+<%=c_func(:nodef)%>(VALUE self, VALUE rary)
+{
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0}, {rb_cArray,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 2, 0, ain, 0};
+
+    na_ndloop_store_rarray(&ndf, self, rary);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/store_bit.c b/ext/numo/narray/gen/tmpl_bit/store_bit.c
new file mode 100644
index 0000000..e2eb50a
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/store_bit.c
@@ -0,0 +1,66 @@
+/*
+  Iterator copying n bits from argument 1 (a1/p1) to argument 0 (a3/p3).
+
+  When either side has a stride other than 1 or uses an index list the
+  bits are copied one at a time.  Otherwise the copy is done a whole
+  BIT_DIGIT at a time in three phases: a leading partial word of the
+  destination, full words, and a trailing partial word.
+
+  o1 is the bit offset of the source relative to the destination;
+  l1 = NB+o1 and r1 = NB-o1 are the complementary shift counts used to
+  pull in bits from the neighbouring source word.
+  (NB is presumably the number of bits in a BIT_DIGIT and SLB(k) a mask of
+  the k low bits -- TODO confirm against their definitions.)
+*/
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  n;
+    size_t  p1, p3;
+    ssize_t s1, s3;
+    size_t *idx1, *idx3;
+    int     o1, l1, r1, len;
+    BIT_DIGIT *a1, *a3;
+    BIT_DIGIT  x;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a3, p3, s3, idx3);
+    INIT_PTR_BIT_IDX(lp, 1, a1, p1, s1, idx1);
+    if (s1!=1 || s3!=1 || idx1 || idx3) {
+        /* general case: bit by bit */
+        for (; n--;) {
+            LOAD_BIT_STEP(a1, p1, s1, idx1, x);
+            STORE_BIT_STEP(a3, p3, s3, idx3, x);
+        }
+    } else {
+        /* NOTE(review): p1 is reduced modulo NB here but p3 is used as is,
+           and a1/a3 are not advanced by whole words; this presumably relies
+           on INIT_PTR_BIT_IDX delivering positions below NB -- confirm. */
+        o1 =  p1 % NB;
+        o1 -= p3;
+        l1 =  NB+o1;
+        r1 =  NB-o1;
+        if (p3>0 || n<NB) {
+            /* leading partial word: only bits p3 .. p3+len-1 of *a3 change */
+            len = NB - p3;
+            if ((int)n<len) len=n;
+            if (o1>=0) x = *a1>>o1;
+            else       x = *a1<<-o1;
+            if (p1+len>NB)  x |= *(a1+1)<<r1;
+            a1++;
+            *a3 = (x & (SLB(len)<<p3)) | (*a3 & ~(SLB(len)<<p3));
+            a3++;
+            n -= len;
+        }
+        if (o1==0) {
+            /* source and destination are aligned: plain word copy */
+            for (; n>=NB; n-=NB) {
+                x = *(a1++);
+                *(a3++) = x;
+            }
+        } else {
+            /* NOTE(review): "*a1>>o1" is evaluated even when o1 is negative
+               (the o1<0 case is handled just below), and shifting by a
+               negative count is undefined behaviour in C -- confirm. */
+            for (; n>=NB; n-=NB) {
+                x = *a1>>o1;
+                if (o1<0)  x |= *(a1-1)>>l1;
+                if (o1>0)  x |= *(a1+1)<<r1;
+                a1++;
+                *(a3++) = x;
+            }
+        }
+        if (n>0) {
+            /* trailing partial word: only the n low bits of *a3 change */
+            x = *a1>>o1;
+            if (o1<0)  x |= *(a1-1)>>l1;
+            *a3 = (x & SLB(n)) | (*a3 & BALL<<n);
+        }
+    }
+}
+
+/*
+  Store obj into self by running the iterator above through na_ndloop,
+  with self as the overwritten argument.  Returns self.
+*/
+static VALUE
+<%=c_func(:nodef)%>(VALUE self, VALUE obj)
+{
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{Qnil,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 2,0, ain,0};
+
+    na_ndloop(&ndf, 2, self, obj);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/store_from.c b/ext/numo/narray/gen/tmpl_bit/store_from.c
new file mode 100644
index 0000000..84330d9
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/store_from.c
@@ -0,0 +1,56 @@
+/*
+  Iterator converting every element of argument 1 with the conversion
+  macro of this template and storing the result as one bit of argument 0.
+  Source and destination may each be addressed either through an index
+  list or by a constant stride, which gives the four loops below.
+*/
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    ssize_t  i, s1, s2;
+    size_t   p1;
+    char    *p2;
+    size_t  *idx1, *idx2;
+    <%=dtype%> x;
+    BIT_DIGIT *a1;
+    BIT_DIGIT  y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+
+    if (idx1) {
+        /* destination bits are addressed through an index list */
+        if (idx2) {
+            while (i--) {
+                GET_DATA_INDEX(p2,idx2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                STORE_BIT(a1, p1+*idx1, y);
+                idx1++;
+            }
+        } else {
+            while (i--) {
+                GET_DATA_STRIDE(p2,s2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                STORE_BIT(a1, p1+*idx1, y);
+                idx1++;
+            }
+        }
+    } else {
+        /* destination bits are addressed by a constant stride */
+        if (idx2) {
+            while (i--) {
+                GET_DATA_INDEX(p2,idx2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                STORE_BIT(a1, p1, y);
+                p1 += s1;
+            }
+        } else {
+            while (i--) {
+                GET_DATA_STRIDE(p2,s2,<%=dtype%>,x);
+                y = <%=macro%>(x);
+                STORE_BIT(a1, p1, y);
+                p1 += s1;
+            }
+        }
+    }
+}
+
+
+/*
+  Store obj into self by running the converting iterator above through
+  na_ndloop, with self as the overwritten argument.  Returns self.
+*/
+static VALUE
+<%=c_func(:nodef)%>(VALUE self, VALUE obj)
+{
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{Qnil,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 2,0, ain,0};
+
+    na_ndloop(&ndf, 2, self, obj);
+    return self;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/to_a.c b/ext/numo/narray/gen/tmpl_bit/to_a.c
new file mode 100644
index 0000000..fef82a2
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/to_a.c
@@ -0,0 +1,43 @@
+/*
+  Iterator building one Ruby Array from the bits of argument 0 and
+  appending it to the Array held in argument 1.
+*/
+void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t     n;
+    BIT_DIGIT *a1;
+    size_t     p1;
+    ssize_t    s1;
+    size_t    *idx1;
+    BIT_DIGIT  x=0;
+    VALUE      row;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+
+    /* the new row is attached to its parent before it is filled */
+    row = rb_ary_new2(n);
+    rb_ary_push(lp->args[1].value, row);
+
+    while (n--) {
+        if (idx1) {
+            LOAD_BIT(a1,p1+*idx1,x);
+            idx1++;
+        } else {
+            LOAD_BIT(a1,p1,x);
+            p1 += s1;
+        }
+        rb_ary_push(row, m_data_to_num(x));
+    }
+}
+
+/*
+  Convert self to Array.
+  @overload <%=name%>
+  @return [Array]
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
+    ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 3,1, ain,aout};
+    // the iterator above appends one Array per innermost loop; the
+    // conversion itself is driven by na_ndloop_cast_narray_to_rarray
+    return na_ndloop_cast_narray_to_rarray(&ndf, self, Qnil);
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/unary.c b/ext/numo/narray/gen/tmpl_bit/unary.c
new file mode 100644
index 0000000..0d6434a
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/unary.c
@@ -0,0 +1,77 @@
+/*
+  Iterator applying the unary bit operation of this template to n bits of
+  argument 0 (a1/p1) and writing the result to argument 1 (a3/p3).
+
+  When either side has a stride other than 1 or uses an index list the
+  bits are processed one at a time.  Otherwise whole BIT_DIGIT words are
+  processed in three phases: a leading partial word of the destination,
+  full words, and a trailing partial word.
+
+  o1 is the bit offset of the source relative to the destination;
+  l1 = NB+o1 and r1 = NB-o1 are the complementary shift counts used to
+  pull in bits from the neighbouring source word.
+  (NB is presumably the number of bits in a BIT_DIGIT and SLB(k) a mask of
+  the k low bits -- TODO confirm against their definitions.)
+*/
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  n;
+    size_t  p1, p3;
+    ssize_t s1, s3;
+    size_t *idx1, *idx3;
+    int     o1, l1, r1, len;
+    BIT_DIGIT *a1, *a3;
+    BIT_DIGIT  x;
+    BIT_DIGIT  y;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    INIT_PTR_BIT_IDX(lp, 1, a3, p3, s3, idx3);
+    if (s1!=1 || s3!=1 || idx1 || idx3) {
+        /* general case: bit by bit */
+        for (; n--;) {
+            LOAD_BIT_STEP(a1, p1, s1, idx1, x);
+            y = m_<%=name%>(x);
+            STORE_BIT_STEP(a3, p3, s3, idx3, y);
+        }
+    } else {
+        /* NOTE(review): p1 is reduced modulo NB here but p3 is used as is,
+           and a1/a3 are not advanced by whole words; this presumably relies
+           on INIT_PTR_BIT_IDX delivering positions below NB -- confirm. */
+        o1 =  p1 % NB;
+        o1 -= p3;
+        l1 =  NB+o1;
+        r1 =  NB-o1;
+        if (p3>0 || n<NB) {
+            /* leading partial word: only bits p3 .. p3+len-1 of *a3 change */
+            len = NB - p3;
+            if ((int)n<len) len=n;
+            if (o1>=0) x = *a1>>o1;
+            else       x = *a1<<-o1;
+            if (p1+len>NB)  x |= *(a1+1)<<r1;
+            a1++;
+            y = m_<%=name%>(x);
+            *a3 = (y & (SLB(len)<<p3)) | (*a3 & ~(SLB(len)<<p3));
+            a3++;
+            n -= len;
+        }
+        if (o1==0) {
+            /* source and destination are aligned: word by word */
+            for (; n>=NB; n-=NB) {
+                x = *(a1++);
+                y = m_<%=name%>(x);
+                *(a3++) = y;
+            }
+        } else {
+            /* NOTE(review): "*a1>>o1" is evaluated even when o1 is negative
+               (the o1<0 case is handled just below), and shifting by a
+               negative count is undefined behaviour in C -- confirm. */
+            for (; n>=NB; n-=NB) {
+                x = *a1>>o1;
+                if (o1<0)  x |= *(a1-1)>>l1;
+                if (o1>0)  x |= *(a1+1)<<r1;
+                a1++;
+                y = m_<%=name%>(x);
+                *(a3++) = y;
+            }
+        }
+        if (n>0) {
+            /* trailing partial word: only the n low bits of *a3 change */
+            x = *a1>>o1;
+            if (o1<0)  x |= *(a1-1)>>l1;
+            y = m_<%=name%>(x);
+            *a3 = (y & SLB(n)) | (*a3 & BALL<<n);
+        }
+    }
+}
+
+/*
+  Unary <%=name%>.
+  @overload <%=name%>
+  @return [Numo::<%=class_name%>] <%=name%> of self.
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    // one input and one output, both of this class (cT)
+    ndfunc_arg_in_t ain[1] = {{cT,0}};
+    ndfunc_arg_out_t aout[1] = {{cT,0}};
+    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 1,1, ain,aout};
+
+    return na_ndloop(&ndf, 1, self);
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/where.c b/ext/numo/narray/gen/tmpl_bit/where.c
new file mode 100644
index 0000000..9760e6d
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/where.c
@@ -0,0 +1,86 @@
+typedef struct {
+    size_t count;
+    char  *idx0;
+    char  *idx1;
+    size_t elmsz;
+} where_opt_t;
+
+#define STORE_INT(ptr, esz, x) memcpy(ptr,&(x),esz)
+
+/*
+  Iterator recording the running position of every non-zero bit.
+  The position counter and the output cursor live in the where_opt_t
+  reached through the option pointer and are written back at the end so
+  that the next call continues where this one stopped.
+*/
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  n;
+    BIT_DIGIT *a;
+    size_t  p;
+    ssize_t s;
+    size_t *idx;
+    BIT_DIGIT x=0;
+    where_opt_t *g = (where_opt_t*)(lp->opt_ptr);
+    size_t  count = g->count;
+    char   *out   = g->idx1;
+    size_t  e     = g->elmsz;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a, p, s, idx);
+
+    for (; n>0; n--, count++) {
+        if (idx) {
+            LOAD_BIT(a, p+*idx, x);
+            idx++;
+        } else {
+            LOAD_BIT(a, p, x);
+            p += s;
+        }
+        if (x==0) {
+            continue;
+        }
+        STORE_INT(out,e,count);
+        out += e;
+    }
+
+    g->count = count;
+    g->idx1  = out;
+}
+
+/*
+  Returns the array of index where the bit is one (true).
+  @overload <%=op_map%>
+  @return [Numo::Int32,Numo::Int64]
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    volatile VALUE idx_1;
+    size_t size, n_1;
+    where_opt_t *g;
+
+    ndfunc_arg_in_t ain[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 1, 0, ain, 0 };
+
+    size = RNARRAY_SIZE(self);
+    n_1 = NUM2SIZET(<%=find_tmpl("count_true").c_func%>(0, NULL, self));
+    g = ALLOCA_N(where_opt_t,1);
+    g->count = 0;
+    // The largest index that can be stored is size-1.  Int32 is signed, so
+    // it can only be used while size-1 <= 2147483647; comparing against the
+    // unsigned 32-bit limit would let indices from 2**31 on wrap negative.
+    if (size>2147483648ul) {
+        idx_1 = nary_new(numo_cInt64, 1, &n_1);
+        g->elmsz = 8;
+    } else {
+        idx_1 = nary_new(numo_cInt32, 1, &n_1);
+        g->elmsz = 4;
+    }
+    g->idx1 = na_get_pointer_for_write(idx_1);
+    g->idx0 = NULL;   // the zero positions are not collected here
+    na_ndloop3(&ndf, g, 1, self);
+    na_release_lock(idx_1);
+    return idx_1;
+}
diff --git a/ext/numo/narray/gen/tmpl_bit/where2.c b/ext/numo/narray/gen/tmpl_bit/where2.c
new file mode 100644
index 0000000..de28efe
--- /dev/null
+++ b/ext/numo/narray/gen/tmpl_bit/where2.c
@@ -0,0 +1,91 @@
+/*
+  Iterator recording the running position of every bit: positions of
+  zero bits go to the idx0 cursor, positions of non-zero bits to the idx1
+  cursor.  Counter and cursors live in the where_opt_t reached through the
+  option pointer and are written back at the end.
+*/
+static void
+<%=c_iter%>(na_loop_t *const lp)
+{
+    size_t  n;
+    BIT_DIGIT *a;
+    size_t  p;
+    ssize_t s;
+    size_t *idx;
+    BIT_DIGIT x=0;
+    where_opt_t *g = (where_opt_t*)(lp->opt_ptr);
+    size_t  count    = g->count;
+    char   *out_zero = g->idx0;
+    char   *out_one  = g->idx1;
+    size_t  e        = g->elmsz;
+
+    INIT_COUNTER(lp, n);
+    INIT_PTR_BIT_IDX(lp, 0, a, p, s, idx);
+
+    for (; n>0; n--, count++) {
+        if (idx) {
+            LOAD_BIT(a, p+*idx, x);
+            idx++;
+        } else {
+            LOAD_BIT(a, p, x);
+            p += s;
+        }
+        if (x!=0) {
+            STORE_INT(out_one,e,count);
+            out_one += e;
+        } else {
+            STORE_INT(out_zero,e,count);
+            out_zero += e;
+        }
+    }
+
+    g->count = count;
+    g->idx0  = out_zero;
+    g->idx1  = out_one;
+}
+
+/*
+  Returns two index arrays.
+  The first array contains index where the bit is one (true).
+  The second array contains index where the bit is zero (false).
+  @overload <%=op_map%>
+  @return [Numo::Int32,Numo::Int64]*2
+*/
+static VALUE
+<%=c_func(0)%>(VALUE self)
+{
+    // volatile keeps both result arrays visible to the garbage collector
+    // while only their raw buffers are in use by the loop below
+    volatile VALUE idx_1, idx_0;
+    size_t size, n_1, n_0;
+    where_opt_t *g;
+
+    ndfunc_arg_in_t ain[1] = {{cT,0}};
+    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 1, 0, ain, 0 };
+
+    size = RNARRAY_SIZE(self);
+    n_1 = NUM2SIZET(<%=find_tmpl("count_true").c_func%>(0, NULL, self));
+    n_0 = size - n_1;
+    g = ALLOCA_N(where_opt_t,1);
+    g->count = 0;
+    // The largest index that can be stored is size-1.  Int32 is signed, so
+    // it can only be used while size-1 <= 2147483647; comparing against the
+    // unsigned 32-bit limit would let indices from 2**31 on wrap negative.
+    if (size>2147483648ul) {
+        idx_1 = nary_new(numo_cInt64, 1, &n_1);
+        idx_0 = nary_new(numo_cInt64, 1, &n_0);
+        g->elmsz = 8;
+    } else {
+        idx_1 = nary_new(numo_cInt32, 1, &n_1);
+        idx_0 = nary_new(numo_cInt32, 1, &n_0);
+        g->elmsz = 4;
+    }
+    g->idx1 = na_get_pointer_for_write(idx_1);
+    g->idx0 = na_get_pointer_for_write(idx_0);
+    na_ndloop3(&ndf, g, 1, self);
+    na_release_lock(idx_0);
+    na_release_lock(idx_1);
+    return rb_assoc_new(idx_1,idx_0);
+}
diff --git a/ext/numo/narray/index.c b/ext/numo/narray/index.c
new file mode 100644
index 0000000..3a649b3
--- /dev/null
+++ b/ext/numo/narray/index.c
@@ -0,0 +1,842 @@
+/*
+  index.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+//#define NARRAY_C
+
+#include <string.h>
+#include <ruby.h>
+#include "numo/narray.h"
+#include "numo/template.h"
+
+#if   SIZEOF_VOIDP == 8
+#define cIndex numo_cInt64
+#elif SIZEOF_VOIDP == 4
+#define cIndex numo_cInt32
+#endif
+
+// from ruby/enumerator.c
+// Local copy of the layout of Ruby's internal enumerator structure.
+// na_parse_enumerator() in this file casts DATA_PTR() of an Enumerator to
+// this type and reads obj, meth and args from it, so the leading fields
+// must match the Ruby version the extension is built against.
+struct enumerator {
+    VALUE obj;
+    ID    meth;
+    VALUE args;
+    // use only above in this source
+    VALUE fib;
+    VALUE dst;
+    VALUE lookahead;
+    VALUE feedvalue;
+    VALUE stop_exc;
+    VALUE size;
+    // incompatible below depending on ruby version
+    //VALUE procs;                      // ruby 2.4
+    //rb_enumerator_size_func *size_fn; // ruby 2.1-2.4
+    //VALUE (*size_fn)(ANYARGS);        // ruby 2.0
+};
+
+// Parsed form of one index argument (one entry per result dimension).
+// note: the memory referenced by this pointer is not freed and causes a memory leak.
+typedef struct {
+    size_t  n; // the number of elements of the dimension
+    size_t  beg; // the starting point in the dimension
+    ssize_t step; // the step size of the dimension
+    size_t *idx; // list of indices
+    int     reduce; // true if the dimension is reduced by addition
+    int     orig_dim; // the dimension of original array
+} na_index_arg_t;
+
+
+// Debug helper: dump the n parsed index arguments starting at q to stdout.
+static void
+print_index_arg(na_index_arg_t *q, int n)
+{
+    int i;
+    const na_index_arg_t *e;
+
+    printf("na_index_arg_t = 0x%"SZF"x {\n",(size_t)q);
+    for (i=0, e=q; i<n; i++, e++) {
+        printf("  q[%d].n=%"SZF"d\n",i,e->n);
+        printf("  q[%d].beg=%"SZF"d\n",i,e->beg);
+        printf("  q[%d].step=%"SZF"d\n",i,e->step);
+        printf("  q[%d].idx=0x%"SZF"x\n",i,(size_t)e->idx);
+        printf("  q[%d].reduce=0x%x\n",i,e->reduce);
+        printf("  q[%d].orig_dim=%d\n",i,e->orig_dim);
+    }
+    printf("}\n");
+}
+
+// Symbols compared against index arguments and method IDs called while
+// parsing them.  They are only declared here; their initialization is not
+// part of this section of the file.
+static VALUE sym_ast;
+static VALUE sym_all;
+//static VALUE sym_reduce;
+static VALUE sym_minus;
+static VALUE sym_new;
+static VALUE sym_reverse;
+static VALUE sym_plus;
+static VALUE sym_sum;
+static VALUE sym_tilde;
+static VALUE sym_rest;
+static ID id_beg;
+static ID id_end;
+static ID id_exclude_end;
+static ID id_each;
+static ID id_step;
+static ID id_dup;
+static ID id_bracket;
+static ID id_shift_left;
+static ID id_mask;
+
+
+// Describe dimension i of the original array as a stepped selection of
+// n elements starting at beg and advancing by step.
+void
+na_index_set_step(na_index_arg_t *q, int i, size_t n, size_t beg, ssize_t step)
+{
+    // a stepped selection carries no explicit index list and no reduction
+    q->idx      = NULL;
+    q->reduce   = 0;
+    q->orig_dim = i;
+
+    q->beg  = beg;
+    q->step = step;
+    q->n    = n;
+}
+
+// Describe dimension i of the original array as the single element x.
+// A negative x counts from the end; anything outside -size .. size-1
+// raises RangeError.  The entry is marked with step 0.
+void
+na_index_set_scalar(na_index_arg_t *q, int i, ssize_t size, ssize_t x)
+{
+    ssize_t pos = x;
+
+    if (pos >= size || pos < -size) {
+        rb_raise(rb_eRangeError,
+                  "array index (%"SZF"d) is out of array size (%"SZF"d)",
+                  x, size);
+    }
+    pos = (pos < 0) ? pos + size : pos;
+
+    q->idx      = NULL;
+    q->reduce   = 0;
+    q->orig_dim = i;
+    q->n        = 1;
+    q->step     = 0;
+    q->beg      = pos;
+}
+
+// Normalize the index pos for a dimension of length size: a negative
+// position counts from the end.  Raises IndexError (reporting the original
+// pos and the dimension number dim) when the result is outside 0 .. size-1.
+static inline ssize_t
+na_range_check(ssize_t pos, ssize_t size, int dim)
+{
+    ssize_t idx = (pos < 0) ? pos + size : pos;
+
+    if (!(idx >= 0 && idx < size)) {
+        rb_raise(rb_eIndexError, "index=%"SZF"d out of shape[%d]=%"SZF"d",
+                 pos, dim, size);
+    }
+    return idx;
+}
+
+// Turn a Ruby Array of integer positions into an explicit index list for
+// dimension orig_dim (of length size).  Every element is converted with
+// NUM2SSIZET and validated by na_range_check, which raises IndexError for
+// an out-of-range position.
+static void
+na_parse_array(VALUE ary, int orig_dim, ssize_t size, na_index_arg_t *q)
+{
+    // RARRAY_LEN yields a long; keeping it in an int would truncate the
+    // length of very large arrays
+    long k;
+    long n = RARRAY_LEN(ary);
+
+    // q->idx is assigned before the loop so that the buffer is reachable
+    // from q if a conversion below raises
+    q->idx = ALLOC_N(size_t, n);
+    for (k=0; k<n; k++) {
+        q->idx[k] = na_range_check(NUM2SSIZET(RARRAY_AREF(ary,k)), size, orig_dim);
+    }
+    q->n    = n;
+    q->beg  = 0;
+    q->step = 1;
+    q->reduce = 0;
+    q->orig_dim = orig_dim;
+}
+
+// Turn a 1-d NArray of positions into an explicit index list for
+// dimension orig_dim (of length size).  The positions are first copied
+// into a temporary cIndex array, then validated one by one with
+// na_range_check.  Raises IndexError if a is not 1-dimensional or a
+// position is out of range.
+static void
+na_parse_narray_index(VALUE a, int orig_dim, ssize_t size, na_index_arg_t *q)
+{
+    VALUE idx;
+    narray_t *na;
+    narray_data_t *nidx;
+    size_t k, n;
+    ssize_t *nidxp;
+
+    GetNArray(a,na);
+    if (NA_NDIM(na) != 1) {
+        rb_raise(rb_eIndexError, "should be 1-d NArray");
+    }
+    n = NA_SIZE(na);
+    idx = nary_new(cIndex,1,&n);
+    na_store(idx,a);
+
+    GetNArrayData(idx,nidx);
+    nidxp   = (ssize_t*)nidx->ptr;
+    q->idx  = ALLOC_N(size_t, n);
+    for (k=0; k<n; k++) {
+        q->idx[k] = na_range_check(nidxp[k], size, orig_dim);
+    }
+    q->n    = n;
+    q->beg  = 0;
+    q->step = 1;
+    q->reduce = 0;
+    q->orig_dim = orig_dim;
+
+    // From GetNArrayData on, only the raw buffer of idx is used while
+    // ALLOC_N above may run the garbage collector.  Keep idx itself alive
+    // until the buffer has been read completely.
+    RB_GC_GUARD(idx);
+}
+
+// Turn a Range (optionally with a step) into a stepped selection of
+// dimension orig_dim, which has length size.
+//
+// range: object answering the begin / end / exclude_end? calls
+// step:  distance between selected positions; must not be 0
+//
+// Negative bounds count from the end of the dimension.
+// Raises ArgumentError for a zero step and RangeError when the selection
+// would leave the dimension.
+static void
+na_parse_range(VALUE range, ssize_t step, int orig_dim, ssize_t size, na_index_arg_t *q)
+{
+    ssize_t n;
+    ssize_t beg, end;
+
+    // a zero step would divide by zero below
+    if (step == 0) {
+        rb_raise(rb_eArgError, "step can't be 0");
+    }
+
+    beg = NUM2SSIZET(rb_funcall(range,id_beg,0));
+    if (beg<0) {
+        beg += size;
+    }
+
+    end = NUM2SSIZET(rb_funcall(range,id_end,0));
+    if (end<0) {
+        end += size;
+    }
+
+    if (RTEST(rb_funcall(range,id_exclude_end,0))) {
+        end--;
+    }
+    // beg is used as an offset, so it must be a valid position after the
+    // wrap-around above (a begin below -size is still negative here).
+    // A negative end is harmless for a forward step, where it only yields
+    // an empty selection (e.g. 0...0), but with a backward step the
+    // selection would run below position 0.
+    if (beg < 0 || beg >= size ||
+        end < -size || end >= size ||
+        (step < 0 && end < 0)) {
+        rb_raise(rb_eRangeError,
+                 "beg=%"SZF"d,end=%"SZF"d is out of array size (%"SZF"d)",
+                 beg, end, size);
+    }
+    // Nothing is selected when end lies on the wrong side of beg for the
+    // direction of step.  This has to be decided before dividing, because
+    // the division truncates toward zero and would otherwise report one
+    // element for e.g. beg=3, end=2, step=2.
+    if ((step > 0 && end < beg) || (step < 0 && end > beg)) {
+        n = 0;
+    } else {
+        n = (end-beg)/step+1;
+    }
+    na_index_set_step(q,orig_dim,n,beg,step);
+
+}
+
+// Accept an Enumerator created from a Range by #each or #step(n) as index
+// for dimension orig_dim.  The receiver, method and arguments are read
+// directly from the enumerator's data structure.
+// Raises TypeError if enum_obj is not a T_DATA object, if it does not wrap
+// a Range, or if the method is neither each nor step; raises ArgumentError
+// if step was not given exactly one argument.
+static void
+na_parse_enumerator(VALUE enum_obj, int orig_dim, ssize_t size, na_index_arg_t *q)
+{
+    struct enumerator *e;
+    ssize_t step;
+    int nargs;
+
+    if (!RB_TYPE_P(enum_obj, T_DATA)) {
+        rb_raise(rb_eTypeError,"wrong argument type (not T_DATA)");
+    }
+    e = (struct enumerator *)DATA_PTR(enum_obj);
+
+    if (!rb_obj_is_kind_of(e->obj, rb_cRange)) {
+        rb_raise(rb_eTypeError,"not Range object");
+    }
+
+    // range.each: same as the plain range
+    if (e->meth == id_each) {
+        na_parse_range(e->obj, 1, orig_dim, size, q);
+        return;
+    }
+    if (e->meth != id_step) {
+        rb_raise(rb_eTypeError,"unknown Range method: %s",rb_id2name(e->meth));
+    }
+
+    // range.step(n): exactly one argument is expected
+    if (TYPE(e->args) != T_ARRAY) {
+        rb_raise(rb_eArgError,"no argument for step");
+    }
+    nargs = RARRAY_LEN(e->args);
+    if (nargs != 1) {
+        rb_raise(rb_eArgError,"invalid number of step argument (1 for %d)",nargs);
+    }
+    step = NUM2SSIZET(RARRAY_AREF(e->args,0));
+    na_parse_range(e->obj, step, orig_dim, size, q);
+}
+
+// Interpret one index object and record the result in *q.
+//
+// a:    Ruby object given as index
+// size: length of the corresponding dimension of the original NArray
+// i:    number of that dimension; recorded as orig_dim
+// q:    receives the parsed description
+//
+// Raises IndexError for an unknown symbol or an unsupported object.
+static void
+na_index_parse_each(volatile VALUE a, ssize_t size, int i, na_index_arg_t *q)
+{
+    switch(TYPE(a)) {
+
+    case T_FIXNUM:
+        na_index_set_scalar(q,i,size,FIX2LONG(a));
+        break;
+
+    case T_BIGNUM:
+    case T_FLOAT:
+        // both go through the generic integer conversion
+        na_index_set_scalar(q,i,size,NUM2SSIZET(a));
+        break;
+
+    case T_NIL:
+    case T_TRUE:
+        // whole dimension
+        na_index_set_step(q,i,size,0,1);
+        break;
+
+    case T_SYMBOL:
+        if (a==sym_all || a==sym_ast) {
+            na_index_set_step(q,i,size,0,1);
+            break;
+        }
+        if (a==sym_reverse) {
+            na_index_set_step(q,i,size,size-1,-1);
+            break;
+        }
+        if (a==sym_new) {
+            na_index_set_step(q,i,1,0,1);
+            break;
+        }
+        if (a==sym_reduce || a==sym_sum || a==sym_plus) {
+            na_index_set_step(q,i,size,0,1);
+            q->reduce = 1;
+            break;
+        }
+        rb_raise(rb_eIndexError, "invalid symbol for index");
+        break;
+
+    case T_ARRAY:
+        na_parse_array(a, i, size, q);
+        break;
+
+    default:
+        if (rb_obj_is_kind_of(a, rb_cRange)) {
+            na_parse_range(a, 1, i, size, q);
+            break;
+        }
+        if (rb_obj_is_kind_of(a, rb_cEnumerator)) {
+            na_parse_enumerator(a, i, size, q);
+            break;
+        }
+        if (rb_obj_is_kind_of(a, na_cStep)) {
+            ssize_t first, stride, count;
+            nary_step_array_index(a, size, (size_t*)(&count), &first, &stride);
+            na_index_set_step(q,i,count,first,stride);
+            break;
+        }
+        // NArray index
+        if (NA_IsNArray(a)) {
+            na_parse_narray_index(a, i, size, q);
+            break;
+        }
+        rb_raise(rb_eIndexError, "not allowed type");
+    }
+}
+
+
+// Parse every index in args against the shape of na, filling q in order.
+// Qfalse stands for the rest (ellipsis) and expands to as many whole
+// dimensions as are needed to reach ndim entries; sym_new inserts an entry
+// without consuming a dimension of na.
+// Returns the product of the lengths (where greater than 1) of all
+// entries that consumed a dimension.
+static size_t
+na_index_parse_args(VALUE args, narray_t *na, na_index_arg_t *q, int ndim)
+{
+    int argi, dim, rest;
+    int nargs = RARRAY_LEN(args);
+    na_index_arg_t *slot = q;
+    size_t total = 1;
+    VALUE v;
+
+    dim = 0;
+    for (argi=0; argi<nargs; argi++) {
+        v = RARRAY_AREF(args,argi);
+
+        // rest (ellipsis) dimension
+        if (v==Qfalse) {
+            for (rest = ndim - (nargs-1); rest>0; rest--) {
+                na_index_parse_each(Qtrue, na->shape[dim], dim, slot);
+                if (slot->n > 1) {
+                    total *= slot->n;
+                }
+                slot++;
+                dim++;
+            }
+            continue;
+        }
+
+        // new dimension
+        if (v==sym_new) {
+            na_index_parse_each(v, 1, dim, slot);
+            slot++;
+            continue;
+        }
+
+        // other dimension
+        na_index_parse_each(v, na->shape[dim], dim, slot);
+        if (slot->n > 1) {
+            total *= slot->n;
+        }
+        slot++;
+        dim++;
+    }
+    return total;
+}
+
+
+// Compute the byte stride of every dimension of the contiguous data array
+// na: the last dimension has stride elmsz, every earlier one the stride of
+// its successor multiplied by the successor's length.
+// strides must have room for na->base.ndim entries; nothing is written for
+// a 0-dimensional array.
+static void
+na_get_strides_nadata(const narray_data_t *na, ssize_t *strides, ssize_t elmsz)
+{
+    int i = na->base.ndim - 1;
+
+    // ndim == 0: there is no last dimension, strides[-1] must not be written
+    if (i < 0) {
+        return;
+    }
+    strides[i] = elmsz;
+    for (; i>0; i--) {
+        strides[i-1] = strides[i] * na->base.shape[i];
+    }
+}
+
+// Fill the view na2 so that it addresses the elements of the contiguous
+// data array na1 selected by the parsed index arguments q[0..ndim-1].
+//
+// elmsz:    byte stride of the last dimension of na1
+// keep_dim: when zero, scalar indices (n==1, step==0) only add to the
+//           offset and do not produce a dimension in na2
+//
+// Index lists in q are handed over to na2 (q[i].idx is cleared) after
+// being converted to byte offsets.
+static void
+na_index_aref_nadata(narray_data_t *na1, narray_view_t *na2,
+                     na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
+{
+    int i, j;
+    int ndim1 = na1->base.ndim;
+    ssize_t size, k, total=1;
+    ssize_t stride1;
+    ssize_t *strides_na1;
+    size_t  *index;
+    ssize_t beg, step;
+    VALUE m;
+
+    // a 0-dimensional array has no strides to compute; still allocate one
+    // slot so that the buffer is never of zero length
+    strides_na1 = ALLOCA_N(ssize_t, ndim1>0 ? ndim1 : 1);
+    if (ndim1 > 0) {
+        na_get_strides_nadata(na1, strides_na1, elmsz);
+    }
+
+    for (i=j=0; i<ndim; i++) {
+        // An entry created for a new dimension that follows the last
+        // original one has orig_dim == ndim1, for which no stride exists.
+        // Use the element size there, as is done for views.
+        if (q[i].orig_dim < ndim1) {
+            stride1 = strides_na1[q[i].orig_dim];
+        } else {
+            stride1 = elmsz;
+        }
+
+        // numeric index -- trim dimension
+        if (!keep_dim && q[i].n==1 && q[i].step==0) {
+            beg  = q[i].beg;
+            na2->offset += stride1 * beg;
+            continue;
+        }
+
+        na2->base.shape[j] = size = q[i].n;
+
+        // record dimension j in the reduce bit mask
+        if (q[i].reduce != 0) {
+            m = rb_funcall(INT2FIX(1),id_shift_left,1,INT2FIX(j));
+            na2->base.reduce = rb_funcall(m,'|',1,na2->base.reduce);
+        }
+
+        // array index
+        if (q[i].idx != NULL) {
+            index = q[i].idx;
+            SDX_SET_INDEX(na2->stridx[j],index);
+            q[i].idx = NULL;
+            // element positions -> byte offsets
+            for (k=0; k<size; k++) {
+                index[k] = index[k] * stride1;
+            }
+        } else {
+            beg  = q[i].beg;
+            step = q[i].step;
+            na2->offset += stride1*beg;
+            SDX_SET_STRIDE(na2->stridx[j], stride1*step);
+        }
+        j++;
+        total *= size;
+    }
+    na2->base.size = total;
+}
+
+
+// Fill the view na2 so that it addresses the elements of the view na1
+// selected by the parsed index arguments q[0..ndim-1].
+//
+// Each dimension of na1 is addressed either by an index list or by a
+// stride, and each entry of q is either an index list or a begin/step
+// pair, which gives the four combinations handled below.  When keep_dim is
+// zero, scalar indices (n==1, step==0) only add to the offset and do not
+// produce a dimension in na2.
+static void
+na_index_aref_naview(narray_view_t *na1, narray_view_t *na2,
+                     na_index_arg_t *q, ssize_t elmsz, int ndim, int keep_dim)
+{
+    int i, j;
+    ssize_t total=1;
+
+    for (i=j=0; i<ndim; i++) {
+        // NOTE(review): this read happens before the orig_dim >= ndim
+        // check further down, so for a new dimension placed after the last
+        // original one it looks like stridx is read one past its end --
+        // confirm.
+        stridx_t sdx1 = na1->stridx[q[i].orig_dim];
+        ssize_t size;
+
+        // numeric index -- trim dimension
+        if (!keep_dim && q[i].n==1 && q[i].step==0) {
+            if (SDX_IS_INDEX(sdx1)) {
+                na2->offset += SDX_GET_INDEX(sdx1)[q[i].beg];
+            } else {
+                na2->offset += SDX_GET_STRIDE(sdx1)*q[i].beg;
+            }
+            continue;
+        }
+
+        na2->base.shape[j] = size = q[i].n;
+
+        // record dimension j in the reduce bit mask
+        if (q[i].reduce != 0) {
+            VALUE m = rb_funcall(INT2FIX(1),id_shift_left,1,INT2FIX(j));
+            na2->base.reduce = rb_funcall(m,'|',1,na2->base.reduce);
+        }
+
+        if (q[i].orig_dim >= na1->base.ndim) {
+            // new dimension
+            SDX_SET_STRIDE(na2->stridx[j], elmsz);
+        }
+        else if (q[i].idx != NULL && SDX_IS_INDEX(sdx1)) {
+            // index <- index
+            // the list from q is handed over to na2 and each position is
+            // replaced by the offset na1 stores for it
+            int k;
+            size_t *index = q[i].idx;
+            SDX_SET_INDEX(na2->stridx[j], index);
+            q[i].idx = NULL;
+
+            for (k=0; k<size; k++) {
+                index[k] = SDX_GET_INDEX(sdx1)[index[k]];
+            }
+        }
+        else if (q[i].idx != NULL && SDX_IS_STRIDE(sdx1)) {
+            // index <- step
+            ssize_t stride1 = SDX_GET_STRIDE(sdx1);
+            size_t *index = q[i].idx;
+            SDX_SET_INDEX(na2->stridx[j],index);
+            q[i].idx = NULL;
+
+            if (stride1<0) {
+                // index entries are unsigned: for a negative stride the
+                // offset is moved to the far end of the dimension and the
+                // positions are mirrored so every entry stays non-negative
+                size_t  last;
+                int k;
+                stride1 = -stride1;
+                last = na1->base.shape[q[i].orig_dim] - 1;
+                if (na2->offset < last * stride1) {
+                    rb_raise(rb_eStandardError,"bug: negative offset");
+                }
+                na2->offset -= last * stride1;
+                for (k=0; k<size; k++) {
+                    index[k] = (last - index[k]) * stride1;
+                }
+            } else {
+                int k;
+                for (k=0; k<size; k++) {
+                    index[k] = index[k] * stride1;
+                }
+            }
+        }
+        else if (q[i].idx == NULL && SDX_IS_INDEX(sdx1)) {
+            // step <- index
+            // a fresh list is allocated and filled with the offsets na1
+            // stores for the positions beg, beg+step, ...
+            int k;
+            size_t beg  = q[i].beg;
+            ssize_t step = q[i].step;
+            size_t *index = ALLOC_N(size_t, size);
+            SDX_SET_INDEX(na2->stridx[j],index);
+            for (k=0; k<size; k++) {
+                index[k] = SDX_GET_INDEX(sdx1)[beg+step*k];
+            }
+        }
+        else if (q[i].idx == NULL && SDX_IS_STRIDE(sdx1)) {
+            // step <- step
+            size_t beg  = q[i].beg;
+            ssize_t step = q[i].step;
+            ssize_t stride1 = SDX_GET_STRIDE(sdx1);
+            na2->offset += stride1*beg;
+            SDX_SET_STRIDE(na2->stridx[j], stride1*step);
+        }
+
+        j++;
+        total *= size;
+    }
+    na2->base.size = total;
+}
+
+
+// Count the entries of q[0..ndim-1] that produce a dimension in the
+// result, i.e. all except scalar indices (n<=1 together with step==0).
+static int
+na_ndim_new_narray(int ndim, const na_index_arg_t *q)
+{
+    int i;
+    int kept = 0;
+
+    for (i=0; i<ndim; i++) {
+        if (q[i].n<=1 && q[i].step==0) {
+            continue;   // scalar index: dimension is dropped
+        }
+        kept++;
+    }
+    return kept;
+}
+
+// Arguments passed from na_aref_md to the functions it runs under
+// rb_ensure (na_aref_md_protected and na_aref_md_ensure).
+typedef struct {
+    VALUE args, self, store; // index arguments, indexed array, optional result array (0 if unused)
+    int ndim;                // number of entries in q
+    na_index_arg_t *q;       // parsed index arguments; freed by na_aref_md_ensure
+    narray_t *na1;           // narray structure of self
+    int keep_dim;            // non-zero: scalar indices keep their dimension
+} na_aref_md_data_t;
+
+// Allocate ndim index-argument entries.  Only the idx pointer of each
+// entry is initialized (to NULL) so that the entries can be released
+// safely even if parsing stops early.
+static na_index_arg_t*
+na_allocate_index_args(int ndim)
+{
+    na_index_arg_t *q = ALLOC_N(na_index_arg_t, ndim);
+    int i = ndim;
+
+    while (i-- > 0) {
+        q[i].idx = NULL;
+    }
+    return q;
+}
+
+// Body of na_aref_md, run under rb_ensure.
+// data_value is a na_aref_md_data_t* cast to VALUE.  Parses the index
+// arguments, allocates a view of the same class as self and fills it from
+// the parsed arguments.  If data->store is set, the view is copied into it
+// and store is returned instead of the view.
+static
+VALUE na_aref_md_protected(VALUE data_value)
+{
+    na_aref_md_data_t *data = (na_aref_md_data_t*)(data_value);
+    VALUE self = data->self;
+    VALUE args = data->args;
+    VALUE store = data->store;
+    int ndim = data->ndim;
+    na_index_arg_t *q = data->q;
+    narray_t *na1 = data->na1;
+    int keep_dim = data->keep_dim;
+
+    int ndim_new;
+    VALUE view;
+    narray_view_t *na2;
+    ssize_t elmsz;
+
+    // the element count returned here is not used
+    na_index_parse_args(args, na1, q, ndim);
+
+    if (na_debug_flag) print_index_arg(q,ndim);
+
+    if (keep_dim) {
+        ndim_new = ndim;
+    } else {
+        // scalar indices do not produce a dimension
+        ndim_new = na_ndim_new_narray(ndim, q);
+    }
+    view = na_s_allocate_view(CLASS_OF(self));
+
+    na_copy_flags(self, view);
+    GetNArrayView(view,na2);
+
+    na_alloc_shape((narray_t*)na2, ndim_new);
+
+    na2->stridx = ALLOC_N(stridx_t,ndim_new);
+
+    elmsz = nary_element_stride(self);
+
+    // NOTE(review): there is no default case; for any other type the view
+    // is returned without data -- confirm that no other type can occur.
+    switch(na1->type) {
+    case NARRAY_DATA_T:
+    case NARRAY_FILEMAP_T:
+        na_index_aref_nadata((narray_data_t *)na1,na2,q,elmsz,ndim,keep_dim);
+        na2->data = self;
+        break;
+    case NARRAY_VIEW_T:
+        // start from the offset and data of the view being indexed
+        na2->offset = ((narray_view_t *)na1)->offset;
+        na2->data = ((narray_view_t *)na1)->data;
+        na_index_aref_naview((narray_view_t *)na1,na2,q,elmsz,ndim,keep_dim);
+        break;
+    }
+    if (store) {
+        na_get_pointer_for_write(store); // allocate memory
+        na_store(na_flatten_dim(store,0),view);
+        return store;
+    }
+    return view;
+}
+
+// Ensure handler of na_aref_md: releases every index list still owned by
+// the parsed arguments and then the argument array itself.
+// data_value is a na_aref_md_data_t* cast to VALUE.  Always returns Qnil.
+static VALUE
+na_aref_md_ensure(VALUE data_value)
+{
+    na_aref_md_data_t *data = (na_aref_md_data_t*)(data_value);
+    na_index_arg_t *q = data->q;
+    int i = data->ndim;
+
+    while (i-- > 0) {
+        xfree(q[i].idx);
+    }
+    xfree(q);
+    return Qnil;
+}
+
+// Multi-dimensional element reference.
+//
+// argc/argv: index arguments
+// self:      array being indexed
+// keep_dim:  non-zero to keep the dimension of scalar indices
+// result_nd: number of index entries to allocate and parse
+//
+// With a single index and result_nd == 1, an Array index is converted to
+// an NArray, a multi-dimensional NArray index is flattened (a result array
+// with the shape of the index is prepared in store), and a
+// multi-dimensional self is flattened as well.
+// The work is done in na_aref_md_protected; the parsed index arguments are
+// released by na_aref_md_ensure whether or not an exception is raised.
+VALUE
+na_aref_md(int argc, VALUE *argv, VALUE self, int keep_dim, int result_nd)
+{
+    VALUE args; // should be GC protected
+    narray_t *na1;
+    na_aref_md_data_t data;
+    VALUE store = 0;
+    VALUE idx;
+    narray_t *nidx;
+
+    GetNArray(self,na1);
+
+    args = rb_ary_new4(argc,argv);
+
+    if (argc == 1 && result_nd == 1) {
+        idx = argv[0];
+        if (rb_obj_is_kind_of(idx, rb_cArray)) {
+            idx = rb_apply(numo_cNArray,id_bracket,idx);
+        }
+        if (rb_obj_is_kind_of(idx, numo_cNArray)) {
+            GetNArray(idx,nidx);
+            if (NA_NDIM(nidx)>1) {
+                // the result takes the shape of the index array
+                store = nary_new(CLASS_OF(self),NA_NDIM(nidx),NA_SHAPE(nidx));
+                idx = na_flatten(idx);
+                RARRAY_ASET(args,0,idx);
+            }
+        }
+        // flatten should be done only for narray-view with non-uniform stride.
+        if (na1->ndim > 1) {
+            self = na_flatten(self);
+            GetNArray(self,na1);
+        }
+    }
+
+    data.args = args;
+    data.self = self;
+    data.store = store;
+    data.ndim = result_nd;
+    data.q = na_allocate_index_args(result_nd);
+    data.na1 = na1;
+    data.keep_dim = keep_dim;
+
+    return rb_ensure(na_aref_md_protected, (VALUE)&data, na_aref_md_ensure, (VALUE)&data);
+}
+
+
+/* method: [](idx1,idx2,...,idxN) -- common entry for element reference */
+VALUE
+na_aref_main(int nidx, VALUE *idx, VALUE self, int keep_dim, int nd)
+{
+    /* reverse the index order first when self is flagged column-major */
+    na_index_arg_to_internal_order(nidx, idx, self);
+
+    switch (nidx) {
+    case 0:   /* no index: return a duplicate of self */
+        return rb_funcall(self,id_dup,0);
+    case 1:   /* a single Bit-array index is handed to its mask method */
+        if (CLASS_OF(idx[0])==numo_cBit) return rb_funcall(idx[0],id_mask,1,self);
+        break;
+    }
+    return na_aref_md(nidx, idx, self, keep_dim, nd);
+}
+
+
+/* method: slice(idx1,idx2,...,idxN) -- like [] but calls with keep_dim=1 */
+static VALUE na_slice(int argc, VALUE *argv, VALUE self)
+{
+    size_t pos_unused;   /* required output slot; the value is not used here */
+    int result_nd;
+
+    result_nd = na_get_result_dimension(self, argc, argv, 0, &pos_unused);
+    return na_aref_main(argc, argv, self, 1, result_nd);
+}
+
+
+static int
+check_index_count(int argc, int na_ndim, int count_new, int count_rest)
+{
+    const int expected_nd = na_ndim + count_new;  /* slots incl. :new axes */
+    if (count_rest == 0) {
+        if (argc == 1 && count_new == 0) return 1;  /* lone index: 1-d access */
+        if (argc != expected_nd) {
+            rb_raise(rb_eIndexError,"# of index(=%i) should be "
+                     "equal to ndim(=%i)",argc,na_ndim);
+        }
+    } else if (count_rest == 1) {
+        if (argc-1 > expected_nd) {
+            rb_raise(rb_eIndexError,"# of index(=%i) > ndim(=%i) with :rest",
+                     argc,na_ndim);
+        }
+    } else {
+        return -1;
+    }
+    return expected_nd;
+}
+
+/*
+  Classifies the index arguments given to self.
+  Returns 0 when the integer arguments address exactly one element (one per
+  dimension, or a single flat index); the position is stored in *pos_idx.
+  Otherwise returns the value of check_index_count() for the arguments.
+  stride is the element stride for non-view arrays (0: ask nary_element_stride).
+*/
+int
+na_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride, size_t *pos_idx)
+{
+    int i, j;
+    int count_new=0, count_rest=0, count_else=0;
+    ssize_t x, s, m, pos, *idx;
+    narray_t *na;
+    narray_view_t *nv;
+    stridx_t sdx;
+    VALUE a;
+
+    GetNArray(self,na);
+    if (na->size == 0) {
+        rb_raise(rb_eRuntimeError, "cannot get index of empty array");
+    }
+    idx = ALLOCA_N(ssize_t, argc);
+    for (i=j=0; i<argc; i++) {
+        a = argv[i];
+        switch(TYPE(a)) {
+        case T_FIXNUM:
+            idx[j++] = FIX2LONG(a);
+            break;
+        case T_BIGNUM:
+        case T_FLOAT:
+            idx[j++] = NUM2SSIZET(a);
+            break;
+        case T_FALSE:
+        case T_SYMBOL:
+            if (a==sym_rest || a==sym_tilde || a==Qfalse) {
+                argv[i] = Qfalse;   /* normalized marker for the rest dimension */
+                count_rest++;
+                break;
+            } else if (a==sym_new || a==sym_minus) {
+                argv[i] = sym_new;  /* normalized marker for a new axis */
+                count_new++;
+            }
+            // not break: :new and any other symbol also count as non-integer
+        default:
+            count_else++;
+        }
+    }
+    if (count_rest > 1) {
+        rb_raise(rb_eIndexError,"multiple rest-dimension is not allowed");
+    }
+    if (count_else != 0) {
+        return check_index_count(argc, na->ndim, count_new, count_rest);
+    }
+    switch(na->type) {
+    case NARRAY_VIEW_T:
+        GetNArrayView(self,nv);
+        pos = nv->offset;
+        if (j == na->ndim) {
+            for (i=j-1; i>=0; i--) {
+                x = na_range_check(idx[i], na->shape[i], i);
+                sdx = nv->stridx[i];
+                if (SDX_IS_INDEX(sdx)) {
+                    pos += SDX_GET_INDEX(sdx)[x];
+                } else {
+                    pos += SDX_GET_STRIDE(sdx)*x;
+                }
+            }
+            *pos_idx = pos;
+        }
+        else if (argc==1 && j==1) {
+            /* single flat index: split it into per-dimension indices */
+            x = na_range_check(idx[0], na->size, 0);
+            for (i=na->ndim-1; i>=0; i--) {
+                s = na->shape[i];
+                m = x % s;
+                x = x / s;
+                sdx = nv->stridx[i];
+                if (SDX_IS_INDEX(sdx)) {
+                    pos += SDX_GET_INDEX(sdx)[m];
+                } else {
+                    pos += SDX_GET_STRIDE(sdx)*m;
+                }
+            }
+            *pos_idx = pos;
+        } else {
+            return check_index_count(argc, na->ndim, count_new, count_rest);
+        }
+        break;
+    default:
+        if (!stride) stride = nary_element_stride(self);
+        if (argc==1 && j==1) {
+            x = na_range_check(idx[0], na->size, 0);
+            *pos_idx = stride * x;
+        }
+        else if (j == na->ndim) {
+            pos = 0;
+            for (i=j-1; i>=0; i--) {
+                x = na_range_check(idx[i], na->shape[i], i);
+                pos += stride * x;
+                stride *= na->shape[i];
+            }
+            *pos_idx = pos;
+        } else {
+            return check_index_count(argc, na->ndim, count_new, count_rest);
+        }
+    }
+    return 0;
+}
+
+
+void
+Init_nary_index()
+{
+    /* method IDs used by the index code */
+    id_beg         = rb_intern("begin");
+    id_end         = rb_intern("end");
+    id_exclude_end = rb_intern("exclude_end?");
+    id_each        = rb_intern("each");
+    id_step        = rb_intern("step");
+    id_dup         = rb_intern("dup");
+    id_bracket     = rb_intern("[]");
+    id_shift_left  = rb_intern("<<");
+    id_mask        = rb_intern("mask");
+    /* symbols used by the index code */
+    sym_ast        = ID2SYM(rb_intern("*"));
+    sym_all        = ID2SYM(rb_intern("all"));
+    sym_minus      = ID2SYM(rb_intern("-"));
+    sym_new        = ID2SYM(rb_intern("new"));
+    sym_reverse    = ID2SYM(rb_intern("reverse"));
+    sym_plus       = ID2SYM(rb_intern("+"));
+    sym_sum        = ID2SYM(rb_intern("sum"));
+    sym_tilde      = ID2SYM(rb_intern("~"));
+    sym_rest       = ID2SYM(rb_intern("rest"));
+    rb_define_method(cNArray, "slice", na_slice, -1);
+}
diff --git a/ext/numo/narray/math.c b/ext/numo/narray/math.c
new file mode 100644
index 0000000..ec5fda8
--- /dev/null
+++ b/ext/numo/narray/math.c
@@ -0,0 +1,147 @@
+/*
+  math.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#include <ruby.h>
+#include "numo/narray.h"
+
+VALUE numo_mNMath;
+EXTERN VALUE numo_mDFloatMath, numo_mDComplexMath;
+EXTERN VALUE numo_mSFloatMath, numo_mSComplexMath;
+static ID id_send;
+static ID id_UPCAST;
+static ID id_DISPATCH;
+static ID id_extract;
+
+/* Looks up the class resulting from combining type1 with type2; nil if unknown. */
+VALUE
+nary_type_s_upcast(VALUE type1, VALUE type2)
+{
+    VALUE found;
+
+    /* identical types need no table lookup */
+    if (type1 == type2) {
+        return type1;
+    }
+    /* first try: type1::UPCAST[type2] */
+    found = rb_hash_aref(rb_const_get(type1, id_UPCAST), type2);
+    if (!NIL_P(found) || TYPE(type2) != T_CLASS) {
+        return found;
+    }
+    /* second try, only for NArray subclasses: type2::UPCAST[type1] */
+    if (!RTEST(rb_class_inherited_p(type2,cNArray))) return found;
+    return rb_hash_aref(rb_const_get(type2, id_UPCAST), type1);
+}
+
+
+VALUE nary_math_cast2(VALUE type1, VALUE type2)
+{
+    /* an NArray class on either side decides through its UPCAST table */
+    if (RTEST(rb_class_inherited_p(type1, cNArray))) {
+        return nary_type_s_upcast(type1, type2);
+    }
+    if (RTEST(rb_class_inherited_p(type2, cNArray))) {
+        return nary_type_s_upcast(type2, type1);
+    }
+    /* anything that is not a pair of Numeric classes yields type2 */
+    if (!RTEST(rb_class_inherited_p(type1, rb_cNumeric)) ||
+        !RTEST(rb_class_inherited_p(type2, rb_cNumeric))) {
+        return type2;
+    }
+    /* two Numeric classes: Complex if either is Complex, else Float */
+    return (RTEST(rb_class_inherited_p(type1, rb_cComplex)) ||
+            RTEST(rb_class_inherited_p(type2, rb_cComplex))) ? rb_cComplex : rb_cFloat;
+}
+
+
+VALUE na_ary_composition_dtype(VALUE);
+
+VALUE nary_mathcast(int argc, VALUE *argv)
+{
+    VALUE acc;
+    int k;
+
+    /* fold the dtype of every argument into a single upcasted type */
+    acc = na_ary_composition_dtype(argv[0]);
+    for (k = 1; k < argc; ++k) {
+        acc = nary_math_cast2(acc, na_ary_composition_dtype(argv[k]));
+        if (NIL_P(acc)) {
+            rb_raise(rb_eTypeError,"includes unknown DataType for upcast");
+        }
+    }
+    return acc;
+}
+
+
+/*
+  Forwards a math function call to the module registered in DISPATCH
+  for the upcasted type of the arguments, eg, Numo::DFloat::Math.
+  @overload method_missing(name,x,...)
+  @param [Symbol] name  method name.
+  @param [NArray,Numeric] x  input array.
+  @return [NArray] result.
+*/
+VALUE nary_math_method_missing(int argc, VALUE *argv, VALUE mod)
+{
+    VALUE dtype, target, result;
+
+    /* argv[0] is the method name; at least one operand must follow */
+    if (argc <= 1) {
+        rb_raise(rb_eArgError,"argument or method missing");
+        return Qnil;
+    }
+    dtype = nary_mathcast(argc-1,argv+1);
+
+    target = rb_hash_aref(rb_const_get(mod, id_DISPATCH), dtype);
+    if (NIL_P(target)) {
+        rb_raise(rb_eTypeError,"%s is unknown for Numo::NMath",
+                 rb_class2name(dtype));
+    }
+    result = rb_funcall2(target,id_send,argc,argv);
+
+    /* when the upcasted type is not an NArray class, an narray result is passed through extract */
+    if (!RTEST(rb_class_inherited_p(dtype,cNArray)) && IsNArray(result)) {
+        result = rb_funcall(result,id_extract,0);
+    }
+    return result;
+}
+
+
+void
+Init_nary_math()
+{
+    VALUE hCast;
+
+    numo_mNMath = rb_define_module_under(mNumo, "NMath");
+    rb_define_singleton_method(numo_mNMath, "method_missing", nary_math_method_missing, -1);
+
+    /* DISPATCH: upcasted operand type => module implementing the math functions */
+    hCast = rb_hash_new();
+    rb_define_const(numo_mNMath, "DISPATCH", hCast);
+    rb_hash_aset(hCast, numo_cInt64,    numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cInt32,    numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cInt16,    numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cInt8,     numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cUInt64,   numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cUInt32,   numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cUInt16,   numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cUInt8,    numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cDFloat,   numo_mDFloatMath);
+    rb_hash_aset(hCast, numo_cDComplex, numo_mDComplexMath);
+    rb_hash_aset(hCast, numo_cSFloat,   numo_mSFloatMath);
+    rb_hash_aset(hCast, numo_cSComplex, numo_mSComplexMath);
+#ifdef RUBY_INTEGER_UNIFICATION
+    rb_hash_aset(hCast, rb_cInteger, rb_mMath);
+#else
+    rb_hash_aset(hCast, rb_cFixnum,  rb_mMath);
+    rb_hash_aset(hCast, rb_cBignum,  rb_mMath);
+#endif
+    rb_hash_aset(hCast, rb_cFloat,   rb_mMath);
+    rb_hash_aset(hCast, rb_cComplex, numo_mDComplexMath);
+
+    id_send     = rb_intern("send");
+    id_UPCAST   = rb_intern("UPCAST");
+    id_DISPATCH = rb_intern("DISPATCH");
+    id_extract  = rb_intern("extract");
+}
diff --git a/ext/numo/narray/narray.c b/ext/numo/narray/narray.c
new file mode 100644
index 0000000..ab38812
--- /dev/null
+++ b/ext/numo/narray/narray.c
@@ -0,0 +1,1975 @@
+/*
+  narray.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#define NARRAY_C
+#include <ruby.h>
+#include <assert.h>
+
+/* global variables within this module */
+VALUE numo_cNArray;
+VALUE rb_mNumo;
+VALUE nary_eCastError;
+VALUE nary_eShapeError;
+VALUE nary_eOperationError;
+VALUE nary_eDimensionError;
+
+static ID id_contiguous_stride;
+static ID id_allocate;
+static ID id_element_byte_size;
+static ID id_fill;
+static ID id_seq;
+static ID id_logseq;
+static ID id_eye;
+static ID id_UPCAST;
+static ID id_cast;
+static ID id_dup;
+static ID id_to_host;
+static ID id_bracket;
+static ID id_shift_left;
+static ID id_eq;
+static ID id_count_false;
+static ID id_axis;
+static ID id_nan;
+static ID id_keepdims;
+
+VALUE cPointer;
+
+VALUE sym_reduce;
+VALUE sym_option;
+VALUE sym_loop_opt;
+VALUE sym_init;
+
+VALUE na_cStep;
+#ifndef HAVE_RB_CCOMPLEX
+VALUE rb_cComplex;
+#endif
+
+int numo_na_inspect_rows=20;
+int numo_na_inspect_cols=80;
+
+void Init_nary_data();
+void Init_nary_ndloop();
+void Init_nary_step();
+void Init_nary_index();
+void Init_numo_bit();
+void Init_numo_int8();
+void Init_numo_int16();
+void Init_numo_int32();
+void Init_numo_int64();
+void Init_numo_uint8();
+void Init_numo_uint16();
+void Init_numo_uint32();
+void Init_numo_uint64();
+void Init_numo_sfloat();
+void Init_numo_scomplex();
+void Init_numo_dfloat();
+void Init_numo_dcomplex();
+void Init_numo_robject();
+void Init_nary_math();
+void Init_nary_rand();
+void Init_nary_array();
+void Init_nary_struct();
+
+const rb_data_type_t na_data_type = {
+    "Numo::NArray",   /* type name; referenced as parent by na_data_type_view */
+    {0, 0, 0,}, 0, 0, 0,   /* no callbacks are set on this base type */
+};
+#include "numo/narray.h"
+
+
+static void
+nary_debug_info_nadata(VALUE self)
+{
+    narray_data_t *nd;
+
+    GetNArrayData(self,nd);
+    printf("  ptr    = 0x%"SZF"x\n", (size_t)(nd->ptr));   /* data pointer value */
+}
+
+
+static VALUE
+nary_debug_info_naview(VALUE self)
+{
+    narray_view_t *nv;
+    size_t *tbl;
+    size_t k;
+    int dim;
+
+    GetNArrayView(self,nv);
+    printf("  data   = 0x%"SZF"x\n", (size_t)nv->data);
+    printf("  offset = %"SZF"d\n", (size_t)nv->offset);
+    printf("  stridx = 0x%"SZF"x\n", (size_t)nv->stridx);
+    /* nothing more to print when no stride/index table exists */
+    if (nv->stridx == NULL) {
+        return Qnil;
+    }
+    printf("  stridx = [");
+    for (dim=0; dim<nv->base.ndim; dim++) {
+        if (!SDX_IS_INDEX(nv->stridx[dim])) {
+            printf(" %"SZF"d", SDX_GET_STRIDE(nv->stridx[dim]));
+            continue;
+        }
+        /* index-array entry: dump one value per element along dim */
+        tbl = SDX_GET_INDEX(nv->stridx[dim]);
+        printf("  index[%d]=[", dim);
+        for (k=0; k<nv->base.shape[dim]; k++) {
+            printf(" %"SZF"d", tbl[k]);
+        }
+        printf(" ] ");
+    }
+    printf(" ]\n");
+    return Qnil;
+}
+
+
+VALUE
+nary_debug_info(VALUE self)
+{
+    narray_t *na;
+    int d;
+
+    GetNArray(self,na);
+
+    /* header common to every narray kind */
+    printf("%s:\n",rb_class2name(CLASS_OF(self)));
+    printf("  id     = 0x%"PRI_VALUE_PREFIX"x\n", self);
+    printf("  type   = %d\n", na->type);
+    printf("  flag   = [%d,%d]\n", na->flag[0], na->flag[1]);
+    printf("  size   = %"SZF"d\n", na->size);
+    printf("  ndim   = %d\n", na->ndim);
+    printf("  shape  = 0x%"SZF"x\n", (size_t)na->shape);
+    if (na->shape) {
+        printf("  shape  = [");
+        for (d=0; d<na->ndim; d++) {
+            printf(" %"SZF"d", na->shape[d]);
+        }
+        printf(" ]\n");
+    }
+
+    /* kind-specific details */
+    if (na->type == NARRAY_VIEW_T) {
+        nary_debug_info_naview(self);
+    } else if (na->type == NARRAY_DATA_T || na->type == NARRAY_FILEMAP_T) {
+        nary_debug_info_nadata(self);
+    }
+    return Qnil;
+}
+
+
+static size_t
+na_view_memsize(const void* ptr)
+{
+    const narray_view_t *nv = ptr;
+    size_t total = sizeof(narray_view_t);
+    int d;
+
+    assert(nv->base.type == NARRAY_VIEW_T);
+
+    if (nv->stridx != NULL) {
+        total += sizeof(stridx_t) * nv->base.ndim;   /* the table itself */
+        for (d=0; d<nv->base.ndim; d++) {
+            if (SDX_IS_INDEX(nv->stridx[d])) {
+                total += sizeof(size_t) * nv->base.shape[d];
+            }
+        }
+    }
+    /* the shape array is counted only when it does not alias base.size */
+    if (nv->base.size > 0 && nv->base.shape != NULL
+        && nv->base.shape != &(nv->base.size)) {
+        total += sizeof(size_t) * nv->base.ndim;
+    }
+    return total;
+}
+
+static void
+na_view_free(void* ptr)
+{
+    int i;
+    narray_view_t *na = (narray_view_t*)ptr;
+    /* free callback listed in na_data_type_view; releases everything owned by the view struct */
+    assert(na->base.type == NARRAY_VIEW_T);
+    /* index arrays first, then the stride/index table that refers to them */
+    if (na->stridx != NULL) {
+        for (i=0; i<na->base.ndim; i++) {
+            if (SDX_IS_INDEX(na->stridx[i])) {
+                xfree(SDX_GET_INDEX(na->stridx[i]));
+            }
+        }
+        xfree(na->stridx);
+        na->stridx = NULL;
+    }
+    if (na->base.size > 0) {   /* NOTE(review): a separately allocated shape of a zero-size view is not freed here -- confirm this cannot leak */
+        if (na->base.shape != NULL && na->base.shape != &(na->base.size)) {   /* shape may alias base.size, which must not be freed */
+            xfree(na->base.shape);
+            na->base.shape = NULL;
+        }
+    }
+    xfree(na);   /* finally the struct itself */
+}
+
+static void
+na_view_gc_mark(void* na)
+{
+    /* only a view holds a reference (its data object) that must be marked */
+    if (((narray_t*)na)->type != NARRAY_VIEW_T) return;
+    rb_gc_mark(((narray_view_t*)na)->data);
+}
+
+const rb_data_type_t na_data_type_view = {
+    "Numo::NArrayView",   /* type name */
+    {na_view_gc_mark, na_view_free, na_view_memsize,},   /* mark, free and memsize callbacks */
+    &na_data_type, 0, 0,   /* parent type is na_data_type */
+};
+
+VALUE
+na_s_allocate_view(VALUE klass)
+{
+    narray_view_t *nv = ALLOC(narray_view_t);
+    /* start as an empty 0-dimensional view that refers to nothing yet */
+    nv->base.type = NARRAY_VIEW_T;
+    nv->base.ndim = 0;
+    nv->base.size = 0;
+    nv->base.shape = NULL;
+    nv->base.reduce = INT2FIX(0);
+    nv->base.flag[0] = NA_FL0_INIT;
+    nv->base.flag[1] = NA_FL1_INIT;
+    nv->data = Qnil;
+    nv->offset = 0;
+    nv->stridx = NULL;
+    return TypedData_Wrap_Struct(klass, &na_data_type_view, (void*)nv);
+}
+
+
+//static const size_t zero=0;
+
+/*
+  Converts a Ruby array of sizes into the internal size_t shape[].
+  self  : narray whose column-major flag selects the reversed order;
+          a false/nil value selects the plain order.
+  ary   : Ruby Array of non-negative integers.
+  shape : output buffer, written at indices 0 .. RARRAY_LEN(ary)-1.
+  Raises ArgumentError when an element is negative.
+*/
+void
+na_array_to_internal_shape(VALUE self, VALUE ary, size_t *shape)
+{
+    const size_t len = RARRAY_LEN(ary);
+    int reversed = 0;
+    size_t k;
+    ssize_t extent;
+
+    /* the column-major flag of self decides the storage order */
+    if (RTEST(self)) {
+        reversed = TEST_COLUMN_MAJOR(self);
+    }
+
+    for (k=0; k<len; k++) {
+        extent = NUM2SSIZET(RARRAY_AREF(ary,k));
+        if (extent < 0) {
+            rb_raise(rb_eArgError,"size must be non-negative");
+        }
+        /* k-th size goes to the mirrored slot when reversed */
+        shape[reversed ? len-1-k : k] = extent;
+    }
+}
+
+
+
+void
+na_alloc_shape(narray_t *na, int ndim)
+{
+    size_t *heap_shape = NULL;
+
+    /* Validate and allocate before touching *na so a raise leaves it unchanged. */
+    if (ndim < 0) {
+        rb_raise(nary_eDimensionError,"ndim=%d is negative", ndim);
+    }
+    if (ndim > NA_MAX_DIMENSION) {
+        rb_raise(nary_eDimensionError,"ndim=%d is too many", ndim);
+    }
+    /* 0-d and 1-d arrays reuse na->size as their one-element shape. */
+    if (ndim > 1) {
+        heap_shape = ALLOC_N(size_t, ndim);
+    }
+    na->ndim = ndim;
+    na->size = 0;
+    na->shape = heap_shape ? heap_shape : &(na->size);
+}
+
+void
+na_setup_shape(narray_t *na, int ndim, size_t *shape)
+{
+    size_t total = 1;
+    int d;
+
+    na_alloc_shape(na, ndim);
+    switch (ndim) {
+    case 0:               /* 0-d array holds one element */
+        na->size = 1;
+        return;
+    case 1:               /* shape aliases size here, so one store sets both */
+        na->size = shape[0];
+        return;
+    }
+    /* NOTE(review): the product is not checked for size_t overflow */
+    for (d=0; d<ndim; d++) {
+        na->shape[d] = shape[d];
+        total *= shape[d];
+    }
+    na->size = total;
+}
+
+static void
+na_setup(VALUE self, int ndim, size_t *shape)
+{
+    narray_t *target;   /* struct wrapped by self */
+    GetNArray(self,target);
+    na_setup_shape(target, ndim, shape);
+}
+
+
+/*
+  @overload initialize(shape)
+  @overload initialize(size0, size1, ...)
+  @param [Array] shape (array of sizes along each dimension)
+  @param [Integer] sizeN (size along Nth-dimension)
+  @return [Numo::NArray] unallocated narray.
+
+  Constructs an instance of NArray class using the given
+  <i>shape</i> or <i>sizes</i>.
+  Note that NArray itself is an abstract super class and
+  not suitable to create instances.
+  Use Typed Subclasses of NArray (DFloat, Int32, etc) to create instances.
+  This method does not allocate memory for array data.
+  Memory is allocated on write method such as #fill, #store, #seq, etc.
+
+  @example
+    i = Numo::Int64.new([2,4,3])
+    #=> Numo::Int64#shape=[2,4,3](empty)
+
+    f = Numo::DFloat.new(3,4)
+    #=> Numo::DFloat#shape=[3,4](empty)
+
+    f.fill(2)
+    #=> Numo::DFloat#shape=[3,4]
+    # [[2, 2, 2, 2],
+    #  [2, 2, 2, 2],
+    #  [2, 2, 2, 2]]
+
+    x = Numo::NArray.new(5)
+    #=> in `new': allocator undefined for Numo::NArray (TypeError)
+    #   	from t.rb:9:in `<main>'
+
+*/
+static VALUE
+na_initialize(VALUE self, VALUE args)
+{
+    VALUE dims = args;
+    VALUE first;
+    size_t *shape = NULL;
+    int ndim;
+
+    /* a single Array argument is itself the shape; otherwise the
+       argument list is the list of sizes */
+    if (RARRAY_LEN(args) == 1) {
+        first = RARRAY_AREF(args,0);
+        if (TYPE(first) == T_ARRAY) {
+            dims = first;
+        }
+    }
+    ndim = RARRAY_LEN(dims);
+    if (ndim > NA_MAX_DIMENSION) {
+        rb_raise(rb_eArgError,"ndim=%d exceeds maximum dimension",ndim);
+    }
+    /* temporary shape buffer (ALLOCA_N), filled from the Ruby values */
+    shape = ALLOCA_N(size_t, ndim);
+    na_array_to_internal_shape(self, dims, shape);
+    na_setup(self, ndim, shape);
+    return self;
+}
+
+
+VALUE
+nary_new(VALUE klass, int ndim, size_t *shape)
+{
+    /* volatile kept from the original; presumably a GC guard -- TODO confirm */
+    volatile VALUE na_obj = rb_funcall(klass, id_allocate, 0);
+
+    na_setup(na_obj, ndim, shape);
+    return na_obj;
+}
+
+
+VALUE
+nary_view_new(VALUE klass, int ndim, size_t *shape)
+{
+    /* like nary_new, but the object comes from na_s_allocate_view */
+    volatile VALUE view_obj = na_s_allocate_view(klass);
+
+    na_setup(view_obj, ndim, shape);
+    return view_obj;
+}
+
+
+/*
+  Replaces the contents of self with the contents of other narray.
+  Used in dup and clone method.
+  @overload initialize_copy(other)
+  @param [Numo::NArray] other
+  @return [Numo::NArray] self
+ */
+static VALUE
+na_initialize_copy(VALUE self, VALUE orig)
+{
+    narray_t *src;
+
+    GetNArray(orig,src);
+    na_setup(self,NA_NDIM(src),NA_SHAPE(src));   /* same shape as orig */
+    na_store(self,orig);                         /* store orig into self */
+    na_copy_flags(orig,self);                    /* then the flags */
+    return self;
+}
+
+
+/*
+ *  call-seq:
+ *     zeros(shape)  => narray
+ *     zeros(size1,size2,...)  => narray
+ *
+ *  Returns a zero-filled narray with <i>shape</i>.
+ *  This singleton method is valid not for NArray class itself
+ *  but for typed NArray subclasses, e.g., DFloat, Int64.
+ *  @example
+ *    a = Numo::DFloat.zeros(3,5)
+ *    => Numo::DFloat#shape=[3,5]
+ *    [[0, 0, 0, 0, 0],
+ *     [0, 0, 0, 0, 0],
+ *     [0, 0, 0, 0, 0]]
+ */
+static VALUE
+na_s_zeros(int argc, VALUE *argv, VALUE klass)
+{
+    /* build the array from the shape arguments, then call fill(0) on it */
+    VALUE ary = rb_class_new_instance(argc, argv, klass);
+    return rb_funcall(ary, id_fill, 1, INT2FIX(0));
+}
+
+
+/*
+ *  call-seq:
+ *     ones(shape)  => narray
+ *     ones(size1,size2,...)  => narray
+ *
+ *  Returns a one-filled narray with <i>shape</i>.
+ *  This singleton method is valid not for NArray class itself
+ *  but for typed NArray subclasses, e.g., DFloat, Int64.
+ *  @example
+ *    a = Numo::DFloat.ones(3,5)
+ *    => Numo::DFloat#shape=[3,5]
+ *    [[1, 1, 1, 1, 1],
+ *     [1, 1, 1, 1, 1],
+ *     [1, 1, 1, 1, 1]]
+ */
+static VALUE
+na_s_ones(int argc, VALUE *argv, VALUE klass)
+{
+    /* build the array from the shape arguments, then call fill(1) on it */
+    VALUE ary = rb_class_new_instance(argc, argv, klass);
+    return rb_funcall(ary, id_fill, 1, INT2FIX(1));
+}
+
+
+/*
+  Returns an array of N linearly spaced points between x1 and x2.
+  This singleton method is valid not for NArray class itself
+  but for typed NArray subclasses, e.g., DFloat, Int64.
+
+  @overload linspace(x1, x2, [n])
+  @param [Numeric] x1   The start value
+  @param [Numeric] x2   The end value
+  @param [Integer] n    The number of elements. (default is 100).
+  @return [Numo::NArray]  result array.
+
+  @example
+    a = Numo::DFloat.linspace(-5,5,7)
+    => Numo::DFloat#shape=[7]
+    [-5, -3.33333, -1.66667, 0, 1.66667, 3.33333, 5]
+ */
+static VALUE
+na_s_linspace(int argc, VALUE *argv, VALUE klass)
+{
+    VALUE vx1, vx2, vsize, span, vstep, ary;
+    double count = 100;
+
+    /* n is optional; both the float count and the size VALUE default to 100 */
+    if (rb_scan_args(argc,argv,"21",&vx1,&vx2,&vsize) == 3) {
+        count = NUM2DBL(vsize);
+    } else {
+        vsize = INT2FIX(100);
+    }
+
+    /* step = (x2 - x1) / (n - 1), computed through Ruby method calls */
+    span  = rb_funcall(vx2, '-', 1, vx1);
+    vstep = rb_funcall(span, '/', 1, DBL2NUM(count-1));
+
+    /* 1-d array of n elements, filled by seq(x1, step) */
+    ary = rb_class_new_instance(1, &vsize, klass);
+    return rb_funcall(ary, id_seq, 2, vx1, vstep);
+}
+
+/*
+  Returns an array of N logarithmically spaced points between base^a and base^b.
+  This singleton method is valid only for NArray subclasses having +logseq+ method,
+  i.e., DFloat, SFloat, DComplex, and SComplex.
+
+  @overload logspace(a, b, [n, base])
+  @param [Numeric] a  The start value
+  @param [Numeric] b  The end value
+  @param [Integer] n  The number of elements. (default is 50)
+  @param [Numeric] base  The base of log space. (default is 10)
+  @return [Numo::NArray]  result array.
+
+  @example
+    Numo::DFloat.logspace(4,0,5,2)
+    => Numo::DFloat#shape=[5]
+       [16, 8, 4, 2, 1]
+    Numo::DComplex.logspace(0,1i*Math::PI,5,Math::E)
+    => Numo::DComplex#shape=[5]
+       [1+4.44659e-323i, 0.707107+0.707107i, 6.12323e-17+1i, -0.707107+0.707107i, ...]
+ */
+static VALUE
+na_s_logspace(int argc, VALUE *argv, VALUE klass)
+{
+    VALUE vx1, vx2, vsize, vbase, span, vstep, ary;
+    double count;
+
+    rb_scan_args(argc,argv,"22",&vx1,&vx2,&vsize,&vbase);
+
+    /* optional arguments are nil when omitted: n defaults to 50, base to 10 */
+    if (NIL_P(vsize)) {
+        count = 50;
+        vsize = INT2FIX(50);
+    } else {
+        count = NUM2DBL(vsize);
+    }
+    if (NIL_P(vbase)) vbase = DBL2NUM(10);
+
+    /* exponent step = (b - a) / (n - 1) */
+    span  = rb_funcall(vx2, '-', 1, vx1);
+    vstep = rb_funcall(span, '/', 1, DBL2NUM(count-1));
+    ary   = rb_class_new_instance(1, &vsize, klass);
+    return rb_funcall(ary, id_logseq, 3, vx1, vstep, vbase);
+}
+
+
+/*
+  Returns a NArray with shape=(n,n) whose diagonal elements are 1, otherwise 0.
+  @overload  eye(n)
+  @param [Integer] n  Size of NArray. Creates 2-D NArray with shape=(n,n)
+  @return [Numo::NArray]  created NArray.
+  @example
+    a = Numo::DFloat.eye(3)
+    => Numo::DFloat#shape=[3,3]
+    [[1, 0, 0],
+     [0, 1, 0],
+     [0, 0, 1]]
+*/
+static VALUE
+na_s_eye(int argc, VALUE *argv, VALUE klass)
+{
+    VALUE square[2], ary;
+
+    if (argc==0) {
+        rb_raise(rb_eArgError,"No argument");
+    }
+    if (argc==1) {
+        /* eye(n): use n for both dimensions */
+        square[0] = square[1] = argv[0];
+        ary = rb_class_new_instance(2, square, klass);
+    } else {
+        ary = rb_class_new_instance(argc, argv, klass);
+    }
+    return rb_funcall(ary, id_eye, 0);
+}
+
+
+
+#define READ 1
+#define WRITE 2
+
+/*
+  Returns the data pointer of self after checking it against the access
+  mode in flag (bitwise OR of READ and WRITE; 0 performs no check).
+  Writing to a frozen narray, or through a view of a frozen narray,
+  raises RuntimeError; an unallocated array is allocated on pure WRITE.
+*/
+static char *
+na_get_pointer_for_rw(VALUE self, int flag)
+{
+    char *ptr;
+    VALUE obj;
+    narray_t *na;
+
+    if ((flag & WRITE) && OBJ_FROZEN(self)) {
+        rb_raise(rb_eRuntimeError, "cannot write to frozen NArray.");
+    }
+
+    GetNArray(self,na);
+
+    switch(NA_TYPE(na)) {
+    case NARRAY_DATA_T:
+        ptr = NA_DATA_PTR(na);
+        if (NA_SIZE(na) > 0 && ptr == NULL) {
+            if (flag & READ) {
+                rb_raise(rb_eRuntimeError,"cannot read unallocated NArray");
+            }
+            if (flag & WRITE) {
+                rb_funcall(self, id_allocate, 0);
+                ptr = NA_DATA_PTR(na);   /* re-read after allocation */
+            }
+        }
+        return ptr;
+    case NARRAY_VIEW_T:
+        obj = NA_VIEW_DATA(na);
+        if ((flag & WRITE) && OBJ_FROZEN(obj)) {
+            rb_raise(rb_eRuntimeError, "cannot write to frozen NArray.");
+        }
+        GetNArray(obj,na);   /* from here on na is the viewed object */
+        switch(NA_TYPE(na)) {
+        case NARRAY_DATA_T:
+            ptr = NA_DATA_PTR(na);
+            if (flag & (READ|WRITE)) {
+                if (NA_SIZE(na) > 0 && ptr == NULL) {
+                    rb_raise(rb_eRuntimeError,"cannot read/write unallocated NArray");
+                }
+            }
+            return ptr;
+        default:
+            rb_raise(rb_eRuntimeError,"invalid NA_TYPE of view: %d",NA_TYPE(na));
+        }
+    default:
+        rb_raise(rb_eRuntimeError,"invalid NA_TYPE: %d",NA_TYPE(na));
+    }
+
+    return NULL;
+}
+
+/* data pointer of self with the READ check applied */
+char *na_get_pointer_for_read(VALUE self)
+{
+    return na_get_pointer_for_rw(self, READ);
+}
+
+/* data pointer of self with the WRITE check applied */
+char *na_get_pointer_for_write(VALUE self)
+{
+    return na_get_pointer_for_rw(self, WRITE);
+}
+
+/* data pointer of self with both the READ and WRITE checks applied */
+char *na_get_pointer_for_read_write(VALUE self)
+{
+    return na_get_pointer_for_rw(self, READ|WRITE);
+}
+
+/* data pointer of self without any access check (flag 0) */
+char *na_get_pointer(VALUE self)
+{
+    return na_get_pointer_for_rw(self, 0);
+}
+
+
+void
+na_release_lock(VALUE self)
+{
+    narray_t *na;
+
+    /* clear the lock on self first ... */
+    UNSET_LOCK(self);
+    GetNArray(self,na);
+
+    /* ... then follow a view to the object it refers to */
+    if (NA_TYPE(na) == NARRAY_VIEW_T) {
+        na_release_lock(NA_VIEW_DATA(na));
+    }
+}
+
+
+/* method: size() -- returns the total number of elements */
+static VALUE
+na_size(VALUE self)
+{
+    narray_t *info;
+    GetNArray(self,info);
+    return SIZET2NUM(info->size);
+}
+
+
+/* method: ndim() -- returns the number of dimensions */
+static VALUE
+na_ndim(VALUE self)
+{
+    narray_t *info;
+    GetNArray(self,info);
+    return INT2NUM(info->ndim);
+}
+
+
+/*
+  Tells whether the narray has no elements.
+  @overload empty?
+  @return [Boolean] true when self.size == 0.
+*/
+static VALUE
+na_empty_p(VALUE self)
+{
+    narray_t *na;
+
+    GetNArray(self,na);
+    /* map the size test onto Ruby true/false */
+    return (NA_SIZE(na)==0) ? Qtrue : Qfalse;
+}
+
+
+/* method: shape() -- returns shape, array of the size of dimensions */
+static VALUE
+na_shape(VALUE self)
+{
+    volatile VALUE result;
+    narray_t *na;
+    size_t ndim, k;
+    size_t src;
+    int reversed;
+
+    GetNArray(self,na);
+    ndim = NA_NDIM(na);
+    /* arrays flagged column-major report their dimensions in reverse order */
+    reversed = TEST_COLUMN_MAJOR(self) ? 1 : 0;
+
+    /* result array created with capacity for ndim entries */
+    result = rb_ary_new2(ndim);
+    for (k=0; k<ndim; k++) {
+        /* k-th reported size comes from the mirrored slot when reversed */
+        src = reversed ? ndim-1-k : k;
+        rb_ary_push(result, SIZET2NUM(na->shape[src]));
+    }
+    return result;
+}
+
+
+unsigned int
+nary_element_stride(VALUE v)
+{
+    narray_t *na;
+    VALUE holder = v;
+
+    GetNArray(holder,na);
+    /* a view takes the stride of the object it refers to */
+    if (na->type == NARRAY_VIEW_T) {
+        holder = NA_VIEW_DATA(na);
+        GetNArray(holder,na);
+    }
+    assert(na->type == NARRAY_DATA_T);
+
+    return ((narray_type_info_t *)(RTYPEDDATA_TYPE(holder)->data))->element_stride;
+}
+
+/* reads the class constant named by id_contiguous_stride as a size_t */
+size_t na_dtype_elmsz(VALUE klass)
+{
+    return NUM2SIZET(rb_const_get(klass, id_contiguous_stride));
+}
+
+/* Returns the view offset of self (NA_VIEW_OFFSET); arrays that are
+   not views have no offset and yield 0. */
+size_t
+na_get_offset(VALUE self)
+{
+    narray_t *na;
+
+    GetNArray(self,na);
+
+    /* only the view kind carries an offset */
+    if (na->type == NARRAY_VIEW_T) {
+        return NA_VIEW_OFFSET(na);
+    }
+    return 0;
+}
+
+
+void
+na_index_arg_to_internal_order(int argc, VALUE *argv, VALUE self)
+{
+    int head = 0, tail = argc-1;
+    VALUE held;
+
+    /* nothing to do unless self is flagged column-major */
+    if (!TEST_COLUMN_MAJOR(self)) return;
+
+    /* reverse argv in place, swapping from both ends toward the middle */
+    for (; head < tail; head++, tail--) {
+        held = argv[head]; argv[head] = argv[tail]; argv[tail] = held;
+    }
+}
+
+void
+na_copy_flags(VALUE src, VALUE dst)
+{
+    /* object-level user flags that are OR-ed into dst */
+    const VALUE user_mask =
+        FL_USER1|FL_USER2|FL_USER3|FL_USER4|FL_USER5|FL_USER6|FL_USER7;
+    narray_t *from, *to;
+
+    GetNArray(src,from);
+    GetNArray(dst,to);
+
+    to->flag[0] = from->flag[0];   /* flag[1] is not copied */
+    RBASIC(dst)->flags |= (RBASIC(src)->flags) & user_mask;
+}
+
+
+/* Tells whether the dimensions from start_dim on form a regular "ladder":
+   no index arrays, and each stride equal to the next stride times the
+   next dimension size.  (original note: fix name, ex, allow_stride_for_flatten_view) */
+VALUE
+na_check_ladder(VALUE self, int start_dim)
+{
+    narray_t *na;
+    ssize_t outer, inner;
+    int d, first;
+
+    GetNArray(self,na);
+
+    if (start_dim < -na->ndim || start_dim >= na->ndim) {
+        rb_bug("start_dim (%d) out of range",start_dim);
+    }
+    /* arrays that are not views always pass */
+    if (na->type != NARRAY_VIEW_T) {
+        return Qtrue;
+    }
+    /* negative dim -> position from last dim */
+    first = (start_dim < 0) ? start_dim + NA_NDIM(na) : start_dim;
+
+    /* not ladder if it has index */
+    for (d=first; d<NA_NDIM(na); d++) {
+        if (NA_IS_INDEX_AT(na,d)) {
+            return Qfalse;
+        }
+    }
+    /* check stride: compare each stride with the following one */
+    outer = NA_STRIDE_AT(na,first);
+    for (d=first+1; d<NA_NDIM(na); d++) {
+        inner = NA_STRIDE_AT(na,d);
+        if (outer != (ssize_t)(inner * NA_SHAPE(na)[d])) {
+            return Qfalse;
+        }
+        outer = inner;
+    }
+    return Qtrue;
+}
+
+VALUE
+na_check_contiguous(VALUE self)
+{
+    ssize_t elmsz;
+    narray_t *na;
+    GetNArray(self,na);
+    switch(na->type) {
+    case NARRAY_DATA_T:
+    case NARRAY_FILEMAP_T:
+        return Qtrue;
+    case NARRAY_VIEW_T:
+        /* 0-d view: na_check_ladder(self,0) would call rb_bug and the stride lookup would use index -1 */
+        if (NA_VIEW_STRIDX(na)==0 || NA_NDIM(na)==0) {
+            return Qtrue;
+        }
+        if (na_check_ladder(self,0)==Qtrue) {
+            elmsz = nary_element_stride(self);   /* contiguous: last-dimension stride equals the element stride */
+            if (elmsz == NA_STRIDE_AT(na,NA_NDIM(na)-1)) {
+                return Qtrue;
+            }
+        }
+    }
+    return Qfalse;
+}
+
+//----------------------------------------------------------------------
+
+/*
+ *  call-seq:
+ *     narray.view => narray
+ *
+ *  Return view of NArray
+ */
+VALUE
+na_make_view(VALUE self)
+{
+    int i, nd;
+    size_t  j;
+    size_t *idx1, *idx2;
+    ssize_t stride;
+    narray_t *na;
+    narray_view_t *na1, *na2;
+    volatile VALUE view;
+    /* Builds a new view object of the same class, flags and shape as self. */
+    GetNArray(self,na);
+    nd = na->ndim;
+    /* na2 is the struct of the new view; na/na1 describe self. */
+    view = na_s_allocate_view(CLASS_OF(self));
+    /* flags are copied before the shape and stride table are set up */
+    na_copy_flags(self, view);
+    GetNArrayView(view, na2);
+    /* same shape as self, plus a fresh table with one entry per dimension */
+    na_setup_shape((narray_t*)na2, nd, na->shape);
+    na2->stridx = ALLOC_N(stridx_t,nd);
+    /* fill the table depending on what self is */
+    switch(na->type) {
+    case NARRAY_DATA_T:
+    case NARRAY_FILEMAP_T:
+        stride = nary_element_stride(self);
+        for (i=nd; i--;) {   /* last dimension first: it gets the element stride */
+            SDX_SET_STRIDE(na2->stridx[i],stride);
+            stride *= na->shape[i];
+        }
+        na2->offset = 0;
+        na2->data = self;   /* the view refers directly to self */
+        break;
+    case NARRAY_VIEW_T:
+        GetNArrayView(self, na1);
+        for (i=0; i<nd; i++) {
+            if (SDX_IS_INDEX(na1->stridx[i])) {
+                idx1 = SDX_GET_INDEX(na1->stridx[i]);
+                idx2 = ALLOC_N(size_t,na1->base.shape[i]);   /* the new view gets its own copy of the index array */
+                for (j=0; j<na1->base.shape[i]; j++) {
+                    idx2[j] = idx1[j];
+                }
+                SDX_SET_INDEX(na2->stridx[i],idx2);
+            } else {
+                na2->stridx[i] = na1->stridx[i];   /* plain stride entries are copied by value */
+            }
+        }
+        na2->offset = na1->offset;
+        na2->data = na1->data;   /* refers to the same object as the source view, not to the view itself */
+        break;
+    }
+    /* NOTE(review): stridx entries are uninitialized until the loops above complete -- confirm no raise can occur in between */
+    return view;
+}
+
+
+//----------------------------------------------------------------------
+
+/*
+ *  call-seq:
+ *     narray.expand_dims(dim) => narray view
+ *
+ *  Return a view of self with one extra axis of size 1 inserted
+ *  at the given position.
+ *  @param [Integer] dim  position of the new axis; a negative value
+ *    counts from the end (-1 appends the axis after the last one).
+ *  @return [Numo::NArray]  result narray view.
+ *  @raise [Numo::NArray::DimensionError] if dim is outside -ndim-1..ndim.
+ */
+VALUE
+na_expand_dims(VALUE self, VALUE vdim)
+{
+    int  k, nd, dim;
+    size_t *new_shape, *old_shape;
+    stridx_t *new_stridx, *old_stridx;
+    narray_t *na;
+    narray_view_t *nv;
+    VALUE view;
+
+    GetNArray(self,na);
+    nd = na->ndim;
+
+    dim = NUM2INT(vdim);
+    if (dim < -nd-1 || dim > nd) {
+        rb_raise(nary_eDimensionError,"invalid axis (%d for %dD NArray)",
+                 dim,nd);
+    }
+    if (dim < 0) {
+        dim += nd+1;
+    }
+
+    view = na_make_view(self);
+    GetNArrayView(view, nv);
+
+    new_shape = ALLOC_N(size_t,nd+1);
+    new_stridx = ALLOC_N(stridx_t,nd+1);
+    old_shape = nv->base.shape;
+    old_stridx = nv->stridx;
+
+    // axes in front of the insertion point keep their position
+    for (k=0; k<dim; k++) {
+        new_shape[k] = old_shape[k];
+        new_stridx[k] = old_stridx[k];
+    }
+    // the inserted axis: one element, stride 0
+    new_shape[dim] = 1;
+    SDX_SET_STRIDE(new_stridx[dim],0);
+    // remaining axes move up by one
+    for (k=dim; k<nd; k++) {
+        new_shape[k+1] = old_shape[k];
+        new_stridx[k+1] = old_stridx[k];
+    }
+
+    nv->stridx = new_stridx;
+    xfree(old_stridx);
+    nv->base.shape = new_shape;
+    // the old shape may alias the inline size field; free it only if it does not
+    if (old_shape != &(nv->base.size)) {
+        xfree(old_shape);
+    }
+    nv->base.ndim++;
+    return view;
+}
+
+//----------------------------------------------------------------------
+
+/*
+ *  call-seq:
+ *     narray.reverse([dim0,dim1,..]) => narray
+ *
+ *  Return a view of self whose element order is reversed along the
+ *  selected dimensions.
+ */
+VALUE
+nary_reverse(int argc, VALUE *argv, VALUE self)
+{
+    int dim, ndim;
+    size_t  k, len;
+    size_t  pos;
+    size_t *src_idx, *dst_idx;
+    ssize_t step;
+    bool    flip;
+    narray_t *na;
+    narray_view_t *src, *dst;
+    VALUE view;
+    VALUE reduce;
+
+    // the dimensions to reverse are encoded like a reduce flag
+    reduce = na_reduce_dimension(argc, argv, 1, &self, 0, 0);
+
+    GetNArray(self,na);
+    ndim = na->ndim;
+
+    view = na_s_allocate_view(CLASS_OF(self));
+
+    na_copy_flags(self, view);
+    GetNArrayView(view, dst);
+
+    na_setup_shape((narray_t*)dst, ndim, na->shape);
+    dst->stridx = ALLOC_N(stridx_t,ndim);
+
+    if (na->type == NARRAY_DATA_T || na->type == NARRAY_FILEMAP_T) {
+        step = nary_element_stride(self);
+        pos = 0;
+        for (dim=ndim-1; dim>=0; dim--) {
+            if (na_test_reduce(reduce,dim)) {
+                // start at the last element and walk backwards
+                pos += (na->shape[dim]-1)*step;
+                SDX_SET_STRIDE(dst->stridx[dim],-step);
+            } else {
+                SDX_SET_STRIDE(dst->stridx[dim],step);
+            }
+            step *= na->shape[dim];
+        }
+        dst->offset = pos;
+        dst->data = self;
+    } else if (na->type == NARRAY_VIEW_T) {
+        GetNArrayView(self, src);
+        pos = src->offset;
+        for (dim=0; dim<ndim; dim++) {
+            len = src->base.shape[dim];
+            if (SDX_IS_INDEX(src->stridx[dim])) {
+                // indexed dimension: copy the index array, mirrored if selected
+                src_idx = SDX_GET_INDEX(src->stridx[dim]);
+                dst_idx = ALLOC_N(size_t,len);
+                flip = na_test_reduce(reduce,dim);
+                for (k=0; k<len; k++) {
+                    dst_idx[flip ? len-1-k : k] = src_idx[k];
+                }
+                SDX_SET_INDEX(dst->stridx[dim],dst_idx);
+            } else if (na_test_reduce(reduce,dim)) {
+                // strided dimension: move to the far end and negate the stride
+                step = SDX_GET_STRIDE(src->stridx[dim]);
+                pos += (len-1)*step;
+                SDX_SET_STRIDE(dst->stridx[dim],-step);
+            } else {
+                dst->stridx[dim] = src->stridx[dim];
+            }
+        }
+        dst->offset = pos;
+        dst->data = src->data;
+    }
+
+    return view;
+}
+
+//----------------------------------------------------------------------
+
+/*
+  Look up the result class for a pair of classes through their
+  UPCAST hash constants.  type1's table is consulted first; if it has
+  no entry and type2 is a class inheriting from NArray, type2's table
+  is consulted with type1 as the key.
+  @return the class found, or nil when neither table has an entry.
+*/
+VALUE
+numo_na_upcast(VALUE type1, VALUE type2)
+{
+    VALUE table;
+    VALUE found;
+
+    if (type1==type2) {
+        return type1;
+    }
+    table = rb_const_get(type1, id_UPCAST);
+    found = rb_hash_aref(table, type2);
+    if (!NIL_P(found)) {
+        return found;
+    }
+    // no direct entry: the reverse lookup is only tried for NArray classes
+    if (TYPE(type2)!=T_CLASS) {
+        return found;
+    }
+    if (!RTEST(rb_class_inherited_p(type2,cNArray))) {
+        return found;
+    }
+    table = rb_const_get(type2, id_UPCAST);
+    return rb_hash_aref(table, type1);
+}
+
+/*
+  Returns an array containing other and self, where other is cast
+  to the upcasted NArray type of the two operands.
+  Note that NArray has distinct UPCAST mechanism.
+  Coerce is used for operation between non-NArray and NArray.
+  @overload coerce(other)
+  @param [Object] other  numeric object.
+  @return [Array]  [NArray-casted other, self]
+*/
+static VALUE
+nary_coerce(VALUE x, VALUE y)
+{
+    VALUE klass;
+    VALUE casted;
+
+    klass = numo_na_upcast(CLASS_OF(x), CLASS_OF(y));
+    casted = rb_funcall(klass,id_cast,1,y);
+    return rb_assoc_new(casted, x);
+}
+
+
+/*
+  Returns total byte size of NArray: the per-element byte size of the
+  class multiplied by the number of elements.  A non-Fixnum element
+  size is multiplied as a double and the product is rounded up.
+  @return [Integer] byte size.
+ */
+static VALUE
+nary_byte_size(VALUE self)
+{
+    VALUE velmsz;
+    narray_t *na;
+    size_t nbytes;
+
+    GetNArray(self,na);
+    velmsz = rb_const_get(CLASS_OF(self), id_element_byte_size);
+    if (FIXNUM_P(velmsz)) {
+        nbytes = NUM2SIZET(velmsz) * na->size;
+    } else {
+        nbytes = ceil(NUM2DBL(velmsz) * na->size);
+    }
+    return SIZET2NUM(nbytes);
+}
+
+/*
+  Returns byte size of one element of NArray, read from the
+  element-byte-size constant of the receiving class.
+  @return [Numeric] byte size.
+ */
+static VALUE
+nary_s_byte_size(VALUE type)
+{
+    VALUE velmsz = rb_const_get(type, id_element_byte_size);
+
+    return velmsz;
+}
+
+
+/*
+  Returns a new array initialized from binary raw data in a string.
+  Without shape the result is 1-D and holds as many elements as the
+  string provides.
+  @overload from_binary(string,[shape])
+  @param [String] string  Binary raw data.
+  @param [Integer,Array] shape  size, or array of integers representing array shape.
+  @return [Numo::NArray] NArray containing binary data.
+ */
+static VALUE
+nary_s_from_binary(int argc, VALUE *argv, VALUE type)
+{
+    size_t total, avail, nbytes;
+    size_t *dims;
+    char *dst;
+    int   k, ndim, given, kind;
+    VALUE vstr, vshape, vna;
+    VALUE velmsz;
+
+    given = rb_scan_args(argc,argv,"11",&vstr,&vshape);
+    Check_Type(vstr,T_STRING);
+    avail = RSTRING_LEN(vstr);
+    velmsz = rb_const_get(type, id_element_byte_size);
+    if (given!=2) {
+        // no shape given: derive a 1-D length from the string length
+        ndim = 1;
+        if (FIXNUM_P(velmsz)) {
+            total = avail / NUM2SIZET(velmsz);
+            nbytes = total * NUM2SIZET(velmsz);
+        } else {
+            total = floor(avail / NUM2DBL(velmsz));
+            nbytes = avail;
+        }
+        if (total == 0) {
+            rb_raise(rb_eArgError, "string is empty or too short");
+        }
+        dims = ALLOCA_N(size_t,ndim);
+        dims[0] = total;
+    } else {
+        kind = TYPE(vshape);
+        if (kind == T_FIXNUM) {
+            ndim = 1;
+            total = NUM2SIZET(vshape);
+            dims = &total;
+        } else if (kind == T_ARRAY) {
+            ndim = RARRAY_LEN(vshape);
+            if (ndim == 0 || ndim > NA_MAX_DIMENSION) {
+                rb_raise(nary_eDimensionError,"too long or empty shape (%d)", ndim);
+            }
+            dims = ALLOCA_N(size_t,ndim);
+            total = 1;
+            for (k=0; k<ndim; ++k) {
+                dims[k] = NUM2SIZET(RARRAY_AREF(vshape,k));
+                total *= dims[k];
+            }
+        } else {
+            rb_raise(rb_eArgError,"second argument must be size or shape");
+        }
+        // the requested shape must not need more bytes than the string has
+        if (FIXNUM_P(velmsz)) {
+            nbytes = total * NUM2SIZET(velmsz);
+        } else {
+            nbytes = ceil(total * NUM2DBL(velmsz));
+        }
+        if (nbytes > avail) {
+            rb_raise(rb_eArgError, "specified size is too large");
+        }
+    }
+
+    vna = nary_new(type, ndim, dims);
+    dst = na_get_pointer_for_write(vna);
+
+    memcpy(dst, RSTRING_PTR(vstr), nbytes);
+
+    return vna;
+}
+
+/*
+  Stores binary raw data from a string into self.
+  The string must provide at least as many bytes (after offset) as
+  the byte size of self.
+  @overload store_binary(string,[offset])
+  @param [String] string  Binary raw data.
+  @param [Integer] offset  (optional) Byte offset in string.
+  @return [Integer] stored length in bytes.
+  @raise [TypeError] if string is not a String.
+  @raise [ArgumentError] if offset exceeds the string length or the
+    string is too short.
+ */
+static VALUE
+nary_store_binary(int argc, VALUE *argv, VALUE self)
+{
+    size_t size, str_len, byte_size, offset;
+    char *ptr;
+    int   narg;
+    VALUE vstr, voffset;
+    VALUE velmsz;
+    narray_t *na;
+
+    narg = rb_scan_args(argc,argv,"11",&vstr,&voffset);
+    // RSTRING_LEN/RSTRING_PTR are only valid on a String (same check as from_binary)
+    Check_Type(vstr,T_STRING);
+    str_len = RSTRING_LEN(vstr);
+    if (narg==2) {
+        offset = NUM2SIZET(voffset);
+        if (str_len < offset) {
+            rb_raise(rb_eArgError, "offset is larger than string length");
+        }
+        str_len -= offset;
+    } else {
+        offset = 0;
+    }
+
+    GetNArray(self,na);
+    size = NA_SIZE(na);
+    velmsz = rb_const_get(CLASS_OF(self), id_element_byte_size);
+    if (FIXNUM_P(velmsz)) {
+        byte_size = size * NUM2SIZET(velmsz);
+    } else {
+        // non-Fixnum element size: round the total up
+        byte_size = ceil(size * NUM2DBL(velmsz));
+    }
+    if (byte_size > str_len) {
+        rb_raise(rb_eArgError, "string is too short to store");
+    }
+
+    ptr = na_get_pointer_for_write(self);
+    memcpy(ptr, RSTRING_PTR(vstr)+offset, byte_size);
+
+    return SIZET2NUM(byte_size);
+}
+
+/*
+  Returns string containing the raw data bytes in NArray.
+  A contiguous view is read in place starting at its offset; any
+  other view is duplicated with dup first.
+  @overload to_binary()
+  @return [String] String object containing binary raw data.
+ */
+static VALUE
+nary_to_binary(VALUE self)
+{
+    size_t nbytes, start;
+    char *base;
+    VALUE result;
+    narray_t *na;
+
+    GetNArray(self,na);
+    start = 0;
+    if (na->type == NARRAY_VIEW_T) {
+        if (na_check_contiguous(self)!=Qtrue) {
+            self = rb_funcall(self,id_dup,0);
+        } else {
+            start = NA_VIEW_OFFSET(na);
+        }
+    }
+    nbytes = NUM2SIZET(nary_byte_size(self));
+    base = na_get_pointer_for_read(self);
+    result = rb_usascii_str_new(base+start,nbytes);
+    RB_GC_GUARD(self);
+    return result;
+}
+
+/*
+  Dump marshal data.
+  The result has four entries: format version (1), shape, flag word,
+  and the content.  The content is an Array of the elements for
+  RObject and a binary String (see to_binary) for every other class.
+  @overload marshal_dump()
+  @return [Array] Array containing marshal data.
+ */
+static VALUE
+nary_marshal_dump(VALUE self)
+{
+    VALUE a;
+
+    a = rb_ary_new();
+    rb_ary_push(a, INT2FIX(1));     // version
+    rb_ary_push(a, na_shape(self));
+    rb_ary_push(a, INT2FIX(NA_FLAG0(self)));
+    if (CLASS_OF(self) == numo_cRObject) {
+        narray_t *na;
+        char *ptr;
+        size_t offset=0;
+        GetNArray(self,na);
+        if (na->type == NARRAY_VIEW_T) {
+            if (na_check_contiguous(self)==Qtrue) {
+                offset = NA_VIEW_OFFSET(na);
+            } else {
+                self = rb_funcall(self,id_dup,0);
+            }
+        }
+        ptr = na_get_pointer_for_read(self);
+        // The view offset is a byte offset (to_binary adds it to a char*),
+        // so it must be applied before the pointer is treated as VALUE*;
+        // adding it to a VALUE* would scale it by sizeof(VALUE).
+        rb_ary_push(a, rb_ary_new4(NA_SIZE(na), (VALUE*)(ptr+offset)));
+    } else {
+        rb_ary_push(a, nary_to_binary(self));
+    }
+    RB_GC_GUARD(self);
+    return a;
+}
+
+VALUE na_inplace( VALUE self );
+/*
+  Load marshal data.
+  Expects the four-entry array produced by marshal_dump:
+  version (must be 1), shape, flag word, content.
+  @overload marshal_load(data)
+  @param [Array] data  Array containing marshal data.
+  @return [Numo::NArray] self
+  @raise [ArgumentError] if the data is not a 4-element array, has an
+    unsupported version, or its content has the wrong type or size.
+ */
+static VALUE
+nary_marshal_load(VALUE self, VALUE a)
+{
+    VALUE v;
+
+    if (TYPE(a) != T_ARRAY) {
+        rb_raise(rb_eArgError,"marshal argument should be array");
+    }
+    if (RARRAY_LEN(a) != 4) {
+        rb_raise(rb_eArgError,"marshal array size should be 4");
+    }
+    if (RARRAY_AREF(a,0) != INT2FIX(1)) {
+        rb_raise(rb_eArgError,"NArray marshal version %d is not supported "
+                 "(only version 1)", NUM2INT(RARRAY_AREF(a,0)));
+    }
+    na_initialize(self,RARRAY_AREF(a,1));
+    // NUM2INT type-checks the entry; FIX2INT would decode any VALUE blindly
+    NA_FL0_SET(self,NUM2INT(RARRAY_AREF(a,2)));
+    v = RARRAY_AREF(a,3);
+    if (CLASS_OF(self) == numo_cRObject) {
+        narray_t *na;
+        char *ptr;
+        if (TYPE(v) != T_ARRAY) {
+            rb_raise(rb_eArgError,"RObject content should be array");
+        }
+        GetNArray(self,na);
+        if (RARRAY_LEN(v) != (long)NA_SIZE(na)) {
+            rb_raise(rb_eArgError,"RObject content size mismatch");
+        }
+        ptr = na_get_pointer_for_write(self);
+        memcpy(ptr, RARRAY_PTR(v), NA_SIZE(na)*sizeof(VALUE));
+    } else {
+        // the payload is handed to string accessors, so reject anything else
+        if (TYPE(v) != T_STRING) {
+            rb_raise(rb_eArgError,"binary content should be string");
+        }
+        nary_store_binary(1,&v,self);
+        if (TEST_BYTE_SWAPPED(self)) {
+            rb_funcall(na_inplace(self),id_to_host,0);
+            REVERSE_ENDIAN(self); // correct behavior??
+        }
+    }
+    RB_GC_GUARD(a);
+    return self;
+}
+
+
+/*
+  Cast self to another NArray datatype by calling datatype.cast(self).
+  @overload cast_to(datatype)
+  @param [Class] datatype NArray datatype.
+  @return [Numo::NArray]
+ */
+static VALUE
+nary_cast_to(VALUE obj, VALUE type)
+{
+    VALUE casted;
+
+    casted = rb_funcall(type, id_cast, 1, obj);
+    return casted;
+}
+
+
+
+/*
+  Test whether dimension dim is selected by a reduce flag.
+    - nil/false          : no dimension is selected.
+    - Fixnum 0           : every dimension is selected.
+    - other Fixnum       : bit dim of the value decides.
+    - any other object   : reduce[dim] == 1 decides.
+  @return 1 if selected, 0 otherwise.
+*/
+bool
+na_test_reduce(VALUE reduce, int dim)
+{
+    size_t m;
+
+    if (!RTEST(reduce))
+        return 0;
+    if (FIXNUM_P(reduce)) {
+        m = FIX2LONG(reduce);
+        if (m==0) return 1;
+        // build the mask in size_t: a plain 1u is only as wide as unsigned int,
+        // so shifting it by 32 or more would not test the intended bit of m
+        return (m & (((size_t)1)<<dim)) ? 1 : 0;
+    } else {
+        return (rb_funcall(reduce,id_bracket,1,INT2FIX(dim))==INT2FIX(1)) ?
+            1 : 0 ;
+    }
+}
+
+
+/*
+  Take the reduce flag stored in the first argument array, shifted
+  left by the difference between the largest ndim among the arguments
+  and the ndim of the first one.
+  Raises if naryc < 1, if any array is empty, or if the arrays differ
+  in their column-major flag.
+  If max_arg is non-NULL it receives the index of the argument with
+  the largest ndim (first one wins on ties).
+*/
+static VALUE
+na_get_reduce_flag_from_narray(int naryc, VALUE *naryv, int *max_arg)
+{
+    int max_ndim, first_ndim;
+    int order;
+    int k;
+    size_t shifted;
+    narray_t *na;
+    VALUE reduce;
+
+    if (naryc<1) {
+        rb_raise(rb_eRuntimeError,"must be positive: naryc=%d", naryc);
+    }
+    GetNArray(naryv[0],na);
+    if (na->size==0) {
+        rb_raise(nary_eShapeError,"cannot reduce empty NArray");
+    }
+    reduce = na->reduce;
+    first_ndim = na->ndim;
+    max_ndim = first_ndim;
+    if (max_arg) {
+        *max_arg = 0;
+    }
+    order = TEST_COLUMN_MAJOR(naryv[0]);
+    for (k=0; k<naryc; k++) {
+        GetNArray(naryv[k],na);
+        if (na->size==0) {
+            rb_raise(nary_eShapeError,"cannot reduce empty NArray");
+        }
+        if (TEST_COLUMN_MAJOR(naryv[k]) != order) {
+            rb_raise(nary_eDimensionError,"dimension order is different");
+        }
+        if (na->ndim > max_ndim) { // maximum dimension
+            max_ndim = na->ndim;
+            if (max_arg) {
+                *max_arg = k;
+            }
+        }
+    }
+    if (max_ndim == first_ndim) {
+        return reduce;
+    }
+    shifted = NUM2SIZET(reduce) << (max_ndim-first_ndim);
+    return SIZET2NUM(shifted);
+}
+
+
+/*
+  Build a reduce flag (an Integer bit mask, bit r set for axis r)
+  from an Array of axis specifiers.  Each specifier is a Fixnum
+  (negative counts from the last axis), a Range, or a Step object.
+  When na_obj has its column-major flag set, axis r is mapped to
+  ndim-1-r before its bit is set.
+  Raises DimensionError for an out-of-range Fixnum or an unsupported
+  specifier type.
+*/
+static VALUE
+na_get_reduce_flag_from_axes(VALUE na_obj, VALUE axes)
+{
+    int i, r;
+    int ndim, col_major;
+    long narg;
+    size_t j;
+    size_t len;
+    ssize_t beg, step;
+    VALUE v;
+    size_t m;
+    VALUE reduce;
+    narray_t *na;
+
+    GetNArray(na_obj,na);
+    ndim = na->ndim;
+    col_major = TEST_COLUMN_MAJOR(na_obj);
+
+    m = 0;
+    reduce = Qnil;
+    narg = RARRAY_LEN(axes);
+    for (i=0; i<narg; i++) {
+        v = RARRAY_AREF(axes,i);
+        if (TYPE(v)==T_FIXNUM) {
+            beg = FIX2INT(v);
+            if (beg<0) beg+=ndim;
+            if (beg>=ndim || beg<0) {
+                rb_raise(nary_eDimensionError,"dimension is out of range");
+            }
+            len = 1;
+            step = 0;
+        } else if (rb_obj_is_kind_of(v,rb_cRange) ||
+                   rb_obj_is_kind_of(v,na_cStep)) {
+            nary_step_array_index( v, ndim, &len, &beg, &step );
+        } else {
+            rb_raise(nary_eDimensionError, "invalid dimension argument %s",
+                     rb_obj_classname(v));
+        }
+        for (j=0; j<len; j++) {
+            r = beg + step*j;
+            if (col_major) {
+                r = ndim-1-r;
+            }
+            if (reduce==Qnil) {
+                // Fast path: collect bits in a native word while the bit
+                // index fits.  The limit is the bit width of size_t, not
+                // its size in bytes.
+                if ( r < (int)(sizeof(size_t)*8) ) {
+                    m |= ((size_t)1) << r;
+                    continue;
+                } else {
+                    // switch to Ruby Integer arithmetic for wider masks
+                    reduce = SIZET2NUM(m);
+                }
+            }
+            v = rb_funcall( INT2FIX(1), id_shift_left, 1, INT2FIX(r) );
+            reduce = rb_funcall( reduce, '|', 1, v );
+        }
+    }
+    if (NIL_P(reduce)) reduce = SIZET2NUM(m);
+    return reduce;
+}
+
+
+/*
+  Combine positional axes with the keyword options into a reduce flag.
+    opts[0] : axis keyword (Qundef when absent); may not be combined
+              with positional axes.
+    opts[1] : keepdims keyword; when truthy and ndf is given,
+              NDF_KEEP_DIM is set in ndf->flag.
+  Without any axes the flag derived from the argument arrays is
+  returned; otherwise the flag is built from the axes.
+*/
+VALUE
+nary_reduce_options(VALUE axes, VALUE *opts, int naryc, VALUE *naryv,
+                    ndfunc_t *ndf)
+{
+    int  max_arg;
+    VALUE reduce;
+    VALUE axis_opt;
+
+    // option: axis
+    axis_opt = opts[0];
+    if (axis_opt != Qundef && RTEST(axis_opt)) {
+        if (!NIL_P(axes)) {
+            rb_raise(rb_eArgError,
+              "cannot specify axis-arguments and axis-keyword simultaneously");
+        }
+        axes = (TYPE(axis_opt) == T_ARRAY) ? axis_opt : rb_ary_new3(1,axis_opt);
+    }
+    // option: keepdims
+    if (ndf && opts[1] != Qundef && RTEST(opts[1])) {
+        ndf->flag |= NDF_KEEP_DIM;
+    }
+
+    // always evaluated: it validates the arrays and determines max_arg
+    reduce = na_get_reduce_flag_from_narray(naryc, naryv, &max_arg);
+
+    if (NIL_P(axes)) {
+        return reduce;
+    }
+    return na_get_reduce_flag_from_axes(naryv[max_arg], axes);
+}
+
+
+/*
+  Parse the arguments of a reducing method (positional axes plus the
+  keywords axis:, keepdims:, nan:) and return the reduce flag.
+  When ndf and iter_nan are given and nan: is truthy, ndf->func is
+  replaced by iter_nan.
+*/
+VALUE
+nary_reduce_dimension(int argc, VALUE *argv, int naryc, VALUE *naryv,
+                      ndfunc_t *ndf, na_iter_func_t iter_nan)
+{
+    long npos;
+    VALUE axes;
+    VALUE kw_hash = Qnil;
+    ID kw_table[3] = {id_axis,id_keepdims,id_nan};
+    VALUE opts[3] = {Qundef,Qundef,Qundef};
+
+    npos = rb_scan_args(argc, argv, "*:", &axes, &kw_hash);
+    rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
+
+    // option: nan -- replace to nan-aware iterator function
+    if (ndf && iter_nan && opts[2] != Qundef && RTEST(opts[2])) {
+        ndf->func = iter_nan;
+    }
+
+    // no positional axes: pass nil instead of the empty rest array
+    if (npos == 0) {
+        axes = Qnil;
+    }
+    return na_reduce_options(axes, opts, naryc, naryv, ndf);
+}
+
+/*
+  Return true if the column-major flag of self is set.
+*/
+VALUE
+na_column_major_p( VALUE self )
+{
+    return (TEST_COLUMN_MAJOR(self)) ? Qtrue : Qfalse;
+}
+
+/*
+  Return true if self tests as row major.
+*/
+VALUE
+na_row_major_p( VALUE self )
+{
+    return (TEST_ROW_MAJOR(self)) ? Qtrue : Qfalse;
+}
+
+
+/*
+  Return true if self tests as byte swapped.
+*/
+VALUE
+na_byte_swapped_p( VALUE self )
+{
+    return (TEST_BYTE_SWAPPED(self)) ? Qtrue : Qfalse;
+}
+
+/*
+  Return true if self does not test as byte swapped.
+*/
+VALUE
+na_host_order_p( VALUE self )
+{
+    return (TEST_BYTE_SWAPPED(self)) ? Qfalse : Qtrue;
+}
+
+
+/*
+  Returns a new view of self with the inplace flag set on the view.
+  @return [Numo::NArray] view of narray with inplace flag.
+*/
+VALUE
+na_inplace( VALUE self )
+{
+    VALUE view;
+
+    view = na_make_view(self);
+    SET_INPLACE(view);
+    return view;
+}
+
+/*
+  Set the inplace flag on self itself (no view is created).
+  @return [Numo::NArray] self
+*/
+VALUE
+na_inplace_bang( VALUE self )
+{
+    SET_INPLACE(self);
+
+    return self;
+}
+
+/*
+  Store val into self unless val is the very same object as self,
+  in which case nothing is done and self is returned.
+*/
+VALUE
+na_inplace_store( VALUE self, VALUE val )
+{
+    if (self!=val) {
+        return na_store( self, val );
+    }
+    return self;
+}
+
+/*
+  Return true if the inplace flag of self is set.
+*/
+VALUE
+na_inplace_p( VALUE self )
+{
+    return (TEST_INPLACE(self)) ? Qtrue : Qfalse;
+}
+
+/*
+  Clear the inplace flag of self.
+  @return [Numo::NArray] self
+*/
+VALUE
+na_out_of_place_bang( VALUE self )
+{
+    UNSET_INPLACE(self);
+
+    return self;
+}
+
+int na_debug_flag=0;
+
+/*
+  Set the global debug flag from the truthiness of flag.
+  Registered as NArray.debug=.
+  @return [nil]
+*/
+static VALUE
+na_debug_set(VALUE mod, VALUE flag)
+{
+    na_debug_flag = RTEST(flag) ? 1 : 0;
+    return Qnil;
+}
+
+static double na_profile_value=0;
+
+/*
+  Return the stored profile value as a Float.
+  Registered as NArray.profile.
+*/
+static VALUE
+na_profile(VALUE mod)
+{
+    VALUE vprofile;
+
+    vprofile = rb_float_new(na_profile_value);
+    return vprofile;
+}
+
+/*
+  Store val, converted to double, as the profile value.
+  Registered as NArray.profile=.
+  @return the given val.
+*/
+static VALUE
+na_profile_set(VALUE mod, VALUE val)
+{
+    double converted;
+
+    converted = NUM2DBL(val);
+    na_profile_value = converted;
+    return val;
+}
+
+
+/*
+  Returns the number of rows used for NArray#inspect,
+  or nil when the stored value is not positive.
+  @overload inspect_rows
+  @return [Integer or nil]  the number of rows.
+*/
+static VALUE
+na_inspect_rows(VALUE mod)
+{
+    if (numo_na_inspect_rows <= 0) {
+        return Qnil;
+    }
+    return INT2NUM(numo_na_inspect_rows);
+}
+
+/*
+  Set the number of rows used for NArray#inspect.
+  nil or false stores 0.
+  @overload inspect_rows=(rows)
+  @param [Integer or nil] rows  the number of rows
+  @return [nil]
+*/
+static VALUE
+na_inspect_rows_set(VALUE mod, VALUE num)
+{
+    if (!RTEST(num)) {
+        numo_na_inspect_rows = 0;
+        return Qnil;
+    }
+    numo_na_inspect_rows = NUM2INT(num);
+    return Qnil;
+}
+
+/*
+  Returns the number of cols used for NArray#inspect,
+  or nil when the stored value is not positive.
+  @overload inspect_cols
+  @return [Integer or nil]  the number of cols.
+*/
+static VALUE
+na_inspect_cols(VALUE mod)
+{
+    if (numo_na_inspect_cols <= 0) {
+        return Qnil;
+    }
+    return INT2NUM(numo_na_inspect_cols);
+}
+
+/*
+  Set the number of cols used for NArray#inspect.
+  nil or false stores 0.
+  @overload inspect_cols=(cols)
+  @param [Integer or nil] cols  the number of cols
+  @return [nil]
+*/
+static VALUE
+na_inspect_cols_set(VALUE mod, VALUE num)
+{
+    if (!RTEST(num)) {
+        numo_na_inspect_cols = 0;
+        return Qnil;
+    }
+    numo_na_inspect_cols = NUM2INT(num);
+    return Qnil;
+}
+
+
+/*
+  Equality of self and other in view of numerical array.
+  i.e., both arrays have same shape and corresponding elements are equal.
+  An other that is not an NArray is first cast to the class of self.
+  @overload == other
+  @param [Object] other
+  @return [Boolean] true if self and other is equal.
+*/
+VALUE
+na_equal(VALUE self, volatile VALUE other)
+{
+    volatile VALUE mask;
+    narray_t *lhs, *rhs;
+    int d;
+
+    GetNArray(self,lhs);
+
+    if (!rb_obj_is_kind_of(other,cNArray)) {
+        other = rb_funcall(CLASS_OF(self), id_cast, 1, other);
+    }
+
+    GetNArray(other,rhs);
+    // shapes must match exactly before elements are compared
+    if (lhs->ndim != rhs->ndim) {
+        return Qfalse;
+    }
+    for (d=lhs->ndim; d-- > 0; ) {
+        if (lhs->shape[d] != rhs->shape[d]) {
+            return Qfalse;
+        }
+    }
+    // element-wise eq, then require that no element compared false
+    mask = rb_funcall(self, id_eq, 1, other);
+    if (rb_funcall(mask, id_count_false, 0)==INT2FIX(0)) {
+        return Qtrue;
+    }
+    return Qfalse;
+}
+
+
+
+/*
+  Initialization of the NArray class.
+  Defines the Numo module and Numo::NArray with its error classes,
+  registers the class and instance methods implemented in this file,
+  interns the IDs and symbols used by the C code, and finally calls
+  the Init_* functions of the other parts of the extension.
+*/
+void
+Init_narray()
+{
+    mNumo = rb_define_module("Numo");
+
+    /*
+      Document-class: Numo::NArray
+
+      Numo::NArray is the abstract super class for
+      Numerical N-dimensional Array in the Ruby/Numo module.
+      Use Typed Subclasses of NArray (Numo::DFloat, Int32, etc)
+      to create data array instances.
+    */
+    cNArray = rb_define_class_under(mNumo, "NArray", rb_cObject);
+
+#ifndef HAVE_RB_CCOMPLEX
+    rb_require("complex");
+    rb_cComplex = rb_const_get(rb_cObject, rb_intern("Complex"));
+#endif
+
+    rb_define_const(cNArray, "VERSION", rb_str_new2(NARRAY_VERSION));
+
+    /* error classes, all derived from StandardError */
+    nary_eCastError = rb_define_class_under(cNArray, "CastError", rb_eStandardError);
+    nary_eShapeError = rb_define_class_under(cNArray, "ShapeError", rb_eStandardError);
+    nary_eOperationError = rb_define_class_under(cNArray, "OperationError", rb_eStandardError);
+    nary_eDimensionError = rb_define_class_under(cNArray, "DimensionError", rb_eStandardError);
+
+    /* class-level settings: debug flag, profile value, inspect limits */
+    rb_define_singleton_method(cNArray, "debug=", na_debug_set, 1);
+    rb_define_singleton_method(cNArray, "profile", na_profile, 0);
+    rb_define_singleton_method(cNArray, "profile=", na_profile_set, 1);
+
+    rb_define_singleton_method(cNArray, "inspect_rows", na_inspect_rows, 0);
+    rb_define_singleton_method(cNArray, "inspect_rows=", na_inspect_rows_set, 1);
+    rb_define_singleton_method(cNArray, "inspect_cols", na_inspect_cols, 0);
+    rb_define_singleton_method(cNArray, "inspect_cols=", na_inspect_cols_set, 1);
+
+    /* Ruby allocation framework  */
+    rb_undef_alloc_func(cNArray);
+    rb_define_method(cNArray, "initialize", na_initialize, -2);
+    rb_define_method(cNArray, "initialize_copy", na_initialize_copy, 1);
+
+    /* constructors */
+    rb_define_singleton_method(cNArray, "zeros", na_s_zeros, -1);
+    rb_define_singleton_method(cNArray, "ones", na_s_ones, -1);
+    rb_define_singleton_method(cNArray, "linspace", na_s_linspace, -1);
+    rb_define_singleton_method(cNArray, "logspace", na_s_logspace, -1);
+    rb_define_singleton_method(cNArray, "eye", na_s_eye, -1);
+
+    /* size and shape queries */
+    rb_define_method(cNArray, "size", na_size, 0);
+    rb_define_alias (cNArray, "length","size");
+    rb_define_alias (cNArray, "total","size");
+    rb_define_method(cNArray, "shape", na_shape, 0);
+    rb_define_method(cNArray, "ndim", na_ndim,0);
+    rb_define_alias (cNArray, "rank","ndim");
+    rb_define_method(cNArray, "empty?", na_empty_p, 0);
+
+    rb_define_method(cNArray, "debug_info", nary_debug_info, 0);
+
+    rb_define_method(cNArray, "contiguous?", na_check_contiguous, 0);
+
+    /* methods returning views */
+    rb_define_method(cNArray, "view", na_make_view, 0);
+    rb_define_method(cNArray, "expand_dims", na_expand_dims, 1);
+    rb_define_method(cNArray, "reverse", nary_reverse, -1);
+
+    rb_define_singleton_method(cNArray, "upcast", numo_na_upcast, 1);
+    rb_define_singleton_method(cNArray, "byte_size", nary_s_byte_size, 0);
+
+    /* binary conversion and marshaling */
+    rb_define_singleton_method(cNArray, "from_binary", nary_s_from_binary, -1);
+    rb_define_alias (rb_singleton_class(cNArray), "from_string", "from_binary");
+    rb_define_method(cNArray, "store_binary",  nary_store_binary, -1);
+    rb_define_method(cNArray, "to_binary",  nary_to_binary, 0);
+    rb_define_alias (cNArray, "to_string", "to_binary");
+    rb_define_method(cNArray, "marshal_dump",  nary_marshal_dump, 0);
+    rb_define_method(cNArray, "marshal_load",  nary_marshal_load, 1);
+
+    rb_define_method(cNArray, "byte_size",  nary_byte_size, 0);
+
+    rb_define_method(cNArray, "cast_to", nary_cast_to, 1);
+
+    rb_define_method(cNArray, "coerce", nary_coerce, 1);
+
+    /* flag queries */
+    rb_define_method(cNArray, "column_major?", na_column_major_p, 0);
+    rb_define_method(cNArray, "row_major?", na_row_major_p, 0);
+    rb_define_method(cNArray, "byte_swapped?", na_byte_swapped_p, 0);
+    rb_define_method(cNArray, "host_order?", na_host_order_p, 0);
+
+    /* inplace flag handling */
+    rb_define_method(cNArray, "inplace", na_inplace, 0);
+    rb_define_method(cNArray, "inplace?", na_inplace_p, 0);
+    rb_define_method(cNArray, "inplace!", na_inplace_bang, 0);
+    rb_define_method(cNArray, "out_of_place!", na_out_of_place_bang, 0);
+    rb_define_alias (cNArray, "not_inplace!", "out_of_place!");
+
+    rb_define_method(cNArray, "==", na_equal, 1);
+
+    /* IDs of methods and constants referenced from C */
+    id_allocate = rb_intern("allocate");
+    id_contiguous_stride = rb_intern(CONTIGUOUS_STRIDE);
+    //id_element_bit_size = rb_intern(ELEMENT_BIT_SIZE);
+    id_element_byte_size = rb_intern(ELEMENT_BYTE_SIZE);
+
+    id_fill        = rb_intern("fill");
+    id_seq         = rb_intern("seq");
+    id_logseq      = rb_intern("logseq");
+    id_eye         = rb_intern("eye");
+    id_UPCAST      = rb_intern("UPCAST");
+    id_cast        = rb_intern("cast");
+    id_dup         = rb_intern("dup");
+    id_to_host     = rb_intern("to_host");
+    id_bracket     = rb_intern("[]");
+    id_shift_left  = rb_intern("<<");
+    id_eq          = rb_intern("eq");
+    id_count_false = rb_intern("count_false");
+    id_axis        = rb_intern("axis");
+    id_nan         = rb_intern("nan");
+    id_keepdims    = rb_intern("keepdims");
+
+    sym_reduce   = ID2SYM(rb_intern("reduce"));
+    sym_option   = ID2SYM(rb_intern("option"));
+    sym_loop_opt = ID2SYM(rb_intern("loop_opt"));
+    sym_init     = ID2SYM(rb_intern("init"));
+
+    /* initialize the remaining parts of the extension */
+    Init_nary_step();
+    Init_nary_index();
+
+    Init_nary_data();
+    Init_nary_ndloop();
+
+    Init_numo_dcomplex();
+    Init_numo_dfloat();
+    Init_numo_scomplex();
+    Init_numo_sfloat();
+
+    Init_numo_int64();
+    Init_numo_uint64();
+    Init_numo_int32();
+    Init_numo_uint32();
+    Init_numo_int16();
+    Init_numo_uint16();
+    Init_numo_int8();
+    Init_numo_uint8();
+
+    Init_numo_bit();
+    Init_numo_robject();
+
+    Init_nary_math();
+
+    Init_nary_rand();
+    Init_nary_array();
+    Init_nary_struct();
+}
diff --git a/ext/numo/narray/ndloop.c b/ext/numo/narray/ndloop.c
new file mode 100644
index 0000000..e431a7f
--- /dev/null
+++ b/ext/numo/narray/ndloop.c
@@ -0,0 +1,1961 @@
+/*
+  ndloop.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+
+#include <ruby.h>
+#include "numo/narray.h"
+
+#if 0
+#define DBG(x) x
+#else
+#define DBG(x)
+#endif
+
+#ifdef HAVE_STDARG_PROTOTYPES
+#include <stdarg.h>
+#define va_init_list(a,b) va_start(a,b)
+#else
+#include <varargs.h>
+#define va_init_list(a,b) va_start(a)
+#endif
+
+typedef struct NA_BUFFER_COPY {  // one copy job between an argument's own data and a temporary buffer
+    int ndim;                  // number of dimensions walked by the copy loop
+    size_t elmsz;              // bytes moved by each memcpy
+    size_t *n;                 // shape array allocated in ndfunc_set_bufcp, freed in ndloop_release
+    char *src_ptr;             // data pointer the argument had before the buffer was installed
+    char *buf_ptr;             // temporary buffer (xmalloc'ed in ndfunc_set_bufcp)
+    na_loop_iter_t *src_iter;  // iterators addressing the source data
+    na_loop_iter_t *buf_iter;  // iterators addressing the buffer
+} na_buffer_copy_t;
+
+typedef struct NA_LOOP_XARGS {  // per-argument bookkeeping kept outside the user-visible na_loop_t
+    na_loop_iter_t *iter;     // iterators of this argument, one per dimension (moved from na_loop_t)
+    na_buffer_copy_t *bufcp;  // buffer-copy job for this argument, or NULL if none was set up
+    int flag;                 // access mode: NDL_READ or NDL_WRITE
+    bool free_user_iter;   // if set, ndloop_release also frees LARG(lp,j).iter (checked only when bufcp is set)
+} na_loop_xargs_t;
+
+typedef struct NA_MD_LOOP {   // complete state of one multi-dimensional loop
+    int  narg;                // number of loop arguments: nin + ndfunc->nout
+    int  nin;                 // number of inputs, not counting symbol-typed slots
+    int  ndim;                // number of md-loop dimensions (user dimensions excluded)
+    unsigned int copy_flag;   // set i-th bit if i-th arg is cast
+    size_t  *n_ptr;           // memory for n
+    na_loop_iter_t *iter_ptr; // memory for iter
+    size_t  *n;               // n of elements for each dim
+    na_loop_t  user;          // loop in user function
+    na_loop_xargs_t *xargs;   // extra data for each arg
+    int    writeback;         // write back result to i-th arg (-1: none)
+    int    init_aidx;         // index of initializer argument (-1: none)
+    int    reduce_dim;        // number of dimensions being reduced
+    int   *trans_map;         // maps an array dimension to its loop dimension
+    VALUE  vargs;             // Ruby Array of the arguments
+    VALUE  reduce;            // value tested with na_test_reduce(); nil if no reduction
+    VALUE  loop_opt;          // value of a :loop_opt argument, nil if absent
+    ndfunc_t  *ndfunc;        // definition of the function being looped
+    void (*loop_func)();      // loop driver passed to ndloop_alloc
+} na_md_loop_t;
+
+#define LARG(lp,iarg) ((lp)->user.args[iarg])  /* user-visible descriptor of argument iarg */
+#define LITER(lp,idim,iarg) ((lp)->xargs[iarg].iter[idim])  /* iterator of argument iarg at dimension idim */
+#define LITER_SRC(lp,idim) ((lp)->src_iter[idim])  /* source iterator idim; lp is a na_buffer_copy_t here */
+#define LBUFCP(lp,j) ((lp)->xargs[j].bufcp)  /* buffer-copy job of argument j (may be NULL) */
+
+#define CASTABLE(t) (RTEST(t) && (t)!=OVERWRITE)  /* t is truthy and is not the OVERWRITE marker */
+
+#define NDL_READ 1  /* argument is read */
+#define NDL_WRITE 2  /* argument is written */
+#define NDL_READ_WRITE (NDL_READ|NDL_WRITE)  /* argument is read and written */
+
+static ID id_cast;
+static ID id_extract;
+
+/* Calls the Ruby method `cast` on `type` with `obj` and returns its result. */
+static inline VALUE nary_type_s_cast(VALUE type, VALUE obj)
+{
+    return rb_funcall(type, id_cast, 1, obj);
+}
+
+/* Debug aid: print every field of an ndfunc_t definition to stdout. */
+static void print_ndfunc(ndfunc_t *nf)
+{
+    volatile VALUE desc;  /* inspect string of the current argument type */
+    int arg, d;
+    printf("ndfunc_t = 0x%"SZF"x {\n",(size_t)nf);
+    printf("  func  = 0x%"SZF"x\n", (size_t)nf->func);
+    printf("  flag  = 0x%"SZF"x\n", (size_t)nf->flag);
+    printf("  nin   = %d\n", nf->nin);
+    printf("  nout  = %d\n", nf->nout);
+    printf("  ain   = 0x%"SZF"x\n", (size_t)nf->ain);
+    for (arg=0; arg<nf->nin; arg++) {
+        desc = rb_inspect(nf->ain[arg].type);
+        printf("  ain[%d].type = %s\n", arg, StringValuePtr(desc));
+        printf("  ain[%d].dim = %d\n", arg, nf->ain[arg].dim);
+    }
+    printf("  aout  = 0x%"SZF"x\n", (size_t)nf->aout);
+    for (arg=0; arg<nf->nout; arg++) {
+        desc = rb_inspect(nf->aout[arg].type);
+        printf("  aout[%d].type = %s\n", arg, StringValuePtr(desc));
+        printf("  aout[%d].dim = %d\n", arg, nf->aout[arg].dim);
+        for (d=0; d<nf->aout[arg].dim; d++)
+            printf("  aout[%d].shape[%d] = %"SZF"u\n", arg, d, nf->aout[arg].shape[d]);
+    }
+    printf("}\n");
+}
+
+
+/* Debug aid: print the complete state of a multi-dimensional loop to stdout. */
+static void print_ndloop(na_md_loop_t *lp)
+{
+    int d, a;
+    int total_nd = lp->ndim + lp->user.ndim;  /* loop dimensions + user dimensions */
+    na_buffer_copy_t *cp;
+    printf("na_md_loop_t = 0x%"SZF"x {\n",(size_t)lp);
+    printf("  narg = %d\n", lp->narg);
+    printf("  nin  = %d\n", lp->nin);
+    printf("  ndim = %d\n", lp->ndim);
+    printf("  copy_flag = %x\n", lp->copy_flag);
+    printf("  writeback = %d\n", lp->writeback);
+    printf("  init_aidx = %d\n", lp->init_aidx);
+    printf("  reduce_dim = %d\n", lp->reduce_dim);
+    printf("  trans_map = 0x%"SZF"x\n", (size_t)lp->trans_map);
+    for (d=0; d<total_nd; d++) {
+        printf("  trans_map[%d] = %d\n", d, lp->trans_map[d]);
+    }
+    printf("  n = 0x%"SZF"x\n", (size_t)lp->n);
+    for (d=0; d<=lp->ndim; d++) {
+        printf("  n[%d] = %"SZF"u\n", d, lp->n[d]);
+    }
+    printf("  user.n = 0x%"SZF"x\n", (size_t)lp->user.n);
+    if (lp->user.n) {
+        for (d=0; d<=lp->user.ndim; d++) {
+            printf("  user.n[%d] = %"SZF"u\n", d, lp->user.n[d]);
+        }
+    }
+    printf("  xargs = 0x%"SZF"x\n", (size_t)lp->xargs);
+    printf("  iter_ptr = 0x%"SZF"x\n", (size_t)lp->iter_ptr);
+    printf("  user.narg = %d\n", lp->user.narg);
+    printf("  user.ndim = %d\n", lp->user.ndim);
+    printf("  user.args = 0x%"SZF"x\n", (size_t)lp->user.args);
+    printf("  user.opt_ptr = 0x%"SZF"x\n", (size_t)lp->user.opt_ptr);
+    if (lp->reduce != Qnil) {
+        printf("  reduce  = 0x%x\n", NUM2INT(lp->reduce));
+    } else {
+        printf("  reduce  = nil\n");
+    }
+    for (a=0; a<lp->narg; a++) {
+        printf("--user.args[%d]--\n", a);
+        printf("  user.args[%d].ptr = 0x%"SZF"x\n", a, (size_t)LARG(lp,a).ptr);
+        printf("  user.args[%d].elmsz = %"SZF"d\n", a, LARG(lp,a).elmsz);
+        printf("  user.args[%d].value = 0x%"PRI_VALUE_PREFIX"x\n", a, LARG(lp,a).value);
+        printf("  user.args[%d].ndim = %d\n", a, LARG(lp,a).ndim);
+        printf("  user.args[%d].shape = 0x%"SZF"x\n", a, (size_t)LARG(lp,a).shape);
+        if (LARG(lp,a).shape) {
+            for (d=0; d<LARG(lp,a).ndim; d++) {
+                printf("  user.args[%d].shape[%d] = %"SZF"d\n", a, d, LARG(lp,a).shape[d]);
+            }
+        }
+        printf("  user.args[%d].iter = 0x%"SZF"x\n", a,(size_t)LARG(lp,a).iter);
+        if (LARG(lp,a).iter) {
+            for (d=0; d<lp->user.ndim; d++) {
+                printf(" &user.args[%d].iter[%d] = 0x%"SZF"x\n", a,d, (size_t)&LARG(lp,a).iter[d]);
+                printf("  user.args[%d].iter[%d].pos = %"SZF"u\n", a,d, LARG(lp,a).iter[d].pos);
+                printf("  user.args[%d].iter[%d].step = %"SZF"u\n", a,d, LARG(lp,a).iter[d].step);
+                printf("  user.args[%d].iter[%d].idx = 0x%"SZF"x\n", a,d, (size_t)LARG(lp,a).iter[d].idx);
+            }
+        }
+        // extra per-argument data: flags, all iterators, buffer-copy job
+        printf("  xargs[%d].flag = %d\n", a, lp->xargs[a].flag);
+        printf("  xargs[%d].free_user_iter = %d\n", a, lp->xargs[a].free_user_iter);
+        for (d=0; d<=total_nd; d++) {
+            printf(" &xargs[%d].iter[%d] = 0x%"SZF"x\n", a,d, (size_t)&LITER(lp,d,a));
+            printf("  xargs[%d].iter[%d].pos = %"SZF"u\n", a,d, LITER(lp,d,a).pos);
+            printf("  xargs[%d].iter[%d].step = %"SZF"u\n", a,d, LITER(lp,d,a).step);
+            printf("  xargs[%d].iter[%d].idx = 0x%"SZF"x\n", a,d, (size_t)LITER(lp,d,a).idx);
+        }
+        cp = LBUFCP(lp,a);
+        printf("  xargs[%d].bufcp = 0x%"SZF"x\n", a, (size_t)cp);
+        if (cp) {
+            printf("  xargs[%d].bufcp->ndim = %d\n", a, cp->ndim);
+            printf("  xargs[%d].bufcp->elmsz = %"SZF"d\n", a, cp->elmsz);
+            printf("  xargs[%d].bufcp->n = 0x%"SZF"x\n", a, (size_t)cp->n);
+            printf("  xargs[%d].bufcp->src_ptr = 0x%"SZF"x\n", a, (size_t)cp->src_ptr);
+            printf("  xargs[%d].bufcp->buf_ptr = 0x%"SZF"x\n", a, (size_t)cp->buf_ptr);
+            printf("  xargs[%d].bufcp->src_iter = 0x%"SZF"x\n", a, (size_t)cp->src_iter);
+            printf("  xargs[%d].bufcp->buf_iter = 0x%"SZF"x\n", a, (size_t)cp->buf_iter);
+        }
+    }
+    printf("}\n");
+}
+
+
+/* Returns a bit mask: 1 is set if nf lacks NDF_STRIDE_LOOP, 2 if it lacks
+   NDF_INDEX_LOOP.  The mask is 0 when the user function has no loop of its own. */
+static unsigned int ndloop_func_loop_spec(ndfunc_t *nf, int user_ndim)
+{
+    unsigned int spec = 0;
+
+    // no user dimension and no NDF_HAS_LOOP: nothing to report
+    if (user_ndim <= 0 && !NDF_TEST(nf,NDF_HAS_LOOP)) {
+        return spec;
+    }
+
+    if (!NDF_TEST(nf,NDF_STRIDE_LOOP)) spec |= 1;
+    if (!NDF_TEST(nf,NDF_INDEX_LOOP)) spec |= 2;
+    return spec;
+}
+
+
+
+
+/* Nonzero when `type` is castable and is not already the class of `value`. */
+static int ndloop_cast_required(VALUE type, VALUE value)
+{
+    return CASTABLE(type) ? (type != CLASS_OF(value)) : 0;
+}
+
+/* Nonzero when `type` is a Class for which rb_class_inherited_p(type, cNArray) is true. */
+static int ndloop_castable_type(VALUE type)
+{
+    return rb_obj_is_kind_of(type, rb_cClass) ? RTEST(rb_class_inherited_p(type, cNArray)) : 0;
+}
+
+static void  // report that `value` cannot be cast to `type`; does not return
+ndloop_cast_error(VALUE type, VALUE value)
+{
+    VALUE x = rb_inspect(type);      // printable form of the target type
+    char* s = StringValueCStr(x);
+    rb_bug("fail cast from %s to %s", rb_obj_classname(value),s);  // rb_bug() terminates the interpreter
+    rb_raise(rb_eTypeError,"fail cast from %s to %s",  // NOTE(review): unreachable after rb_bug() above
+             rb_obj_classname(value), s);
+}
+
+// Convert the input arguments args[j] to the type declared in nf->ain[j].type.
+// Symbol-typed slots and arguments that need no cast are left untouched.
+// Returns copy_flag, whose j-th bit is set if the j-th argument was converted.
+static unsigned int
+ndloop_cast_args(ndfunc_t *nf, VALUE args)
+{
+    int j;
+    unsigned int copy_flag=0;
+    VALUE type=Qnil, value=Qnil;  // initialised: RB_GC_GUARD below must not read indeterminate values
+
+    for (j=0; j<nf->nin; j++) {
+
+        type = nf->ain[j].type;
+        if (TYPE(type)==T_SYMBOL)
+            continue;
+        value = RARRAY_AREF(args,j);
+        if (!ndloop_cast_required(type, value))
+            continue;
+
+        if (ndloop_castable_type(type)) {
+            RARRAY_ASET(args,j,nary_type_s_cast(type, value));  // args is modified in place
+            copy_flag |= 1u<<j;  // unsigned literal: copy_flag is an unsigned bit mask
+        } else {
+            ndloop_cast_error(type, value);  // declared type is not an NArray class
+        }
+    }
+
+    RB_GC_GUARD(type); RB_GC_GUARD(value);
+    return copy_flag;
+}
+
+
+/*
+  Handle a symbol-typed input slot of the ndfunc definition.
+    sym_reduce   -> value is stored in lp->reduce
+    sym_option   -> value is stored in lp->user.option
+    sym_loop_opt -> value is stored in lp->loop_opt
+    sym_init     -> the argument index `at` is stored in lp->init_aidx
+  Any other symbol is an internal error and ends in rb_bug().
+*/
+static void
+ndloop_handle_symbol_in_ain(VALUE type, VALUE value, int at, na_md_loop_t *lp)
+{
+    if      (type == sym_reduce)   { lp->reduce = value; }
+    else if (type == sym_option)   { lp->user.option = value; }
+    else if (type == sym_loop_opt) { lp->loop_opt = value; }
+    else if (type == sym_init)     { lp->init_aidx = at; }
+    else {
+        rb_bug("ndloop parse_options: unknown type");
+    }
+}
+
+/* Returns the larger of two ints. */
+static inline int max2(int x, int y)
+{
+    return (y < x) ? x : y;
+}
+
+/* Scan the arguments once: symbol-typed slots go to ndloop_handle_symbol_in_ain();
+   the others yield the input count and the largest user / md-loop dimensions. */
+static void
+ndloop_find_max_dimension(na_md_loop_t *lp, ndfunc_t *nf, VALUE args)
+{
+    int idx;
+    int n_inputs = 0;   // inputs that are not symbol-typed
+    int max_user = 0;   // largest dimension requested by the user function
+    int max_loop = 0;   // largest number of dimensions left for the md-loop
+
+    for (idx=0; idx<RARRAY_LEN(args); idx++) {
+        VALUE type = nf->ain[idx].type;
+        VALUE val = RARRAY_AREF(args,idx);
+        if (TYPE(type)==T_SYMBOL) {
+            ndloop_handle_symbol_in_ain(type, val, idx, lp);
+            continue;
+        }
+        n_inputs++;
+        max_user = max2(max_user, nf->ain[idx].dim);
+        if (IsNArray(val)) max_loop = max2(max_loop, RNARRAY_NDIM(val) - nf->ain[idx].dim);
+    }
+    lp->nin = n_inputs;
+    lp->narg = lp->user.narg = n_inputs + nf->nout;
+    lp->ndim = max_loop;
+    lp->user.ndim = max_user;
+}
+
+/*
+  user-dimension:
+    user_nd = MAX( nf->ain[j].dim )   over all non-symbol inputs j
+
+  user-support dimension:
+
+  loop dimension:
+    loop_nd = MAX( ndim of args[j] - nf->ain[j].dim )   over all NArray inputs j
+*/
+
+static void  // fill in *lp for a run of nf over args; the memory allocated here is freed by ndloop_release()
+ndloop_alloc(na_md_loop_t *lp, ndfunc_t *nf, VALUE args,
+             void *opt_ptr, unsigned int copy_flag,
+             void (*loop_func)(ndfunc_t*, na_md_loop_t*))
+{
+    int i,j;
+    int narg;
+    int max_nd;
+
+    long args_len;
+
+    na_loop_iter_t *iter;
+
+    int trans_dim;
+    unsigned int f;
+
+    args_len = RARRAY_LEN(args);
+
+    if (args_len != nf->nin) {
+        rb_bug("wrong number of arguments for ndfunc (%ld for %d)",  // %ld matches the signed long args_len
+               args_len, nf->nin);
+    }
+
+    lp->vargs = args;
+    lp->ndfunc = nf;
+    lp->loop_func = loop_func;
+    lp->copy_flag = copy_flag;
+
+    lp->reduce = Qnil;            // may be replaced by a symbol-typed argument below
+    lp->user.option = Qnil;
+    lp->user.opt_ptr = opt_ptr;
+    lp->user.err_type = Qfalse;
+    lp->loop_opt = Qnil;
+    lp->writeback = -1;           // -1: no write-back target
+    lp->init_aidx = -1;           // -1: no initializer argument
+
+    lp->n = NULL;                 // all owned pointers start out NULL
+    lp->n_ptr = NULL;
+    lp->xargs = NULL;
+    lp->user.args = NULL;
+    lp->user.n = NULL;
+    lp->iter_ptr = NULL;
+    lp->trans_map = NULL;
+
+    ndloop_find_max_dimension(lp, nf, args);  // sets lp->nin, lp->narg, lp->ndim, lp->user.ndim and options
+    narg = lp->nin + nf->nout;
+    max_nd = lp->ndim + lp->user.ndim;
+
+    lp->n    = lp->n_ptr = ALLOC_N(size_t, max_nd+1);  // max_nd dimensions plus one extra slot
+    lp->xargs = ALLOC_N(na_loop_xargs_t, narg);
+    lp->user.args = ALLOC_N(na_loop_args_t, narg);
+    iter = ALLOC_N(na_loop_iter_t, narg*(max_nd+1));   // one block holding the iterators of all arguments
+    lp->iter_ptr = iter;
+
+    for (j=0; j<narg; j++) {
+        LARG(lp,j).value = Qnil;
+        LARG(lp,j).iter = NULL;
+        LARG(lp,j).shape = NULL;
+        LARG(lp,j).ndim = 0;
+        lp->xargs[j].iter = &(iter[(max_nd+1)*j]);     // argument j owns max_nd+1 consecutive iterators
+        lp->xargs[j].bufcp = NULL;
+        lp->xargs[j].flag = (j<nf->nin) ? NDL_READ : NDL_WRITE;
+        lp->xargs[j].free_user_iter = 0;
+    }
+
+    for (i=0; i<=max_nd; i++) {
+        lp->n[i] = 1;             // length 1 until an argument's shape says otherwise
+        for (j=0; j<narg; j++) {
+            LITER(lp,i,j).pos = 0;
+            LITER(lp,i,j).step = 0;
+            LITER(lp,i,j).idx = NULL;
+        }
+    }
+
+    // transpose reduce-dimensions to last dimensions
+    //              array          loop
+    //           [*,+,*,+,*] => [*,*,*,+,+]
+    // trans_map=[0,3,1,4,2] <= [0,1,2,3,4]
+    lp->trans_map = ALLOC_N(int, max_nd+1);
+    if (NDF_TEST(nf,NDF_FLAT_REDUCE) && RTEST(lp->reduce)) {
+        trans_dim = 0;
+        for (i=0; i<max_nd; i++) {          // pass 1: number the non-reduced dimensions, mark the others
+            if (na_test_reduce(lp->reduce, i)) {
+                lp->trans_map[i] = -1;
+            } else {
+                lp->trans_map[i] = trans_dim++;
+            }
+        }
+        j = trans_dim;
+        for (i=0; i<max_nd; i++) {          // pass 2: reduced dimensions take the trailing slots
+            if (lp->trans_map[i] == -1) {
+                lp->trans_map[i] = j++;
+            }
+        }
+        lp->reduce_dim = max_nd - trans_dim;
+        f = 0;
+        for (i=trans_dim; i<max_nd; i++) {  // rebuild the reduce mask in loop-dimension numbering
+            f |= 1u<<i;                     // unsigned literal: f is an unsigned bit mask
+        }
+        lp->reduce = INT2FIX(f);
+    } else {
+        for (i=0; i<max_nd; i++) {          // identity mapping
+            lp->trans_map[i] = i;
+        }
+        lp->reduce_dim = 0;
+    }
+}
+
+
+/*
+  Release what a loop holds: call na_release_lock() on every NArray
+  argument, free the per-argument copy buffers, then free the arrays
+  owned by the loop structure.  The structure itself (passed as a
+  pointer cast to VALUE) is not freed.  Always returns Qnil.
+*/
+static VALUE
+ndloop_release(VALUE vlp)
+{
+    na_md_loop_t *lp = (na_md_loop_t*)(vlp);
+    na_buffer_copy_t *cp;
+    int a;
+
+    for (a=0; a < lp->narg; a++) {
+        VALUE obj = LARG(lp,a).value;
+        if (IsNArray(obj)) na_release_lock(obj);
+    }
+    for (a=0; a < lp->narg; a++) {
+        cp = LBUFCP(lp,a);
+        if (cp == NULL) continue;
+        xfree(cp->buf_iter);
+        xfree(cp->buf_ptr);
+        xfree(cp->n);
+        xfree(cp);
+        // the user iterator is released only together with a copy buffer
+        if (lp->xargs[a].free_user_iter) xfree(LARG(lp,a).iter);
+    }
+    if (lp->trans_map) xfree(lp->trans_map);
+    xfree(lp->xargs);
+    xfree(lp->iter_ptr);
+    xfree(lp->user.args);
+    xfree(lp->n_ptr);
+    return Qnil;
+}
+
+
+/*
+  Merge the loop part of na's shape (all but its last nf_dim dimensions)
+  into lp->n[], the shape of the n-d iteration.  A length of 1 is
+  repeated; any other mismatch raises nary_eShapeError.
+*/
+static void
+ndloop_check_shape(na_md_loop_t *lp, int nf_dim, narray_t *na)
+{
+    int adim, ldim;
+    size_t extent;
+    const int offset = lp->ndim + nf_dim - na->ndim;
+
+    for (adim = na->ndim - nf_dim - 1; adim>=0; adim--) {
+        extent = na->shape[adim];
+        if (extent == 1) continue;   // length-1 dimension is repeated
+        ldim = lp->trans_map[adim + offset];
+        if (lp->n[ldim] == 1) {
+            lp->n[ldim] = extent;    // first array to fix this loop dimension
+            continue;
+        }
+        if (lp->n[ldim] != extent) {
+            rb_raise(nary_eShapeError,"shape1[%d](=%"SZF"u) != shape2[%d](=%"SZF"u)",
+                     ldim, lp->n[ldim], adim, extent);
+        }
+    }
+}
+
+
+/*
+  Bind NArray vna to loop argument j with access mode rwflag and fill in its per-dimension step/index; invariant: na->shape[i] == lp->n[ dim_map[i] ]
+ */
+static void
+ndloop_set_stepidx(na_md_loop_t *lp, int j, VALUE vna, int *dim_map, int rwflag)
+{
+    size_t n, s;
+    int i, k;
+    stridx_t sdx;
+    narray_t *na;
+
+    LARG(lp,j).value = vna;
+    LARG(lp,j).elmsz = nary_element_stride(vna);
+    if (rwflag == NDL_READ) {                // pick the pointer accessor matching the access mode
+        LARG(lp,j).ptr = na_get_pointer_for_read(vna);
+    } else
+    if (rwflag == NDL_WRITE) {
+        LARG(lp,j).ptr = na_get_pointer_for_write(vna);
+    } else
+    if (rwflag == NDL_READ_WRITE) {
+        LARG(lp,j).ptr = na_get_pointer_for_read_write(vna);
+    } else {
+        rb_bug("invalid value for read-write flag");
+    }
+    GetNArray(vna,na);
+
+    switch(NA_TYPE(na)) {
+    case NARRAY_DATA_T:
+        if (NA_DATA_PTR(na)==NULL && NA_SIZE(na)>0) {
+            rb_bug("cannot read no-data NArray");
+            rb_raise(rb_eRuntimeError,"cannot read no-data NArray");  // NOTE(review): unreachable after rb_bug()
+        }
+        // intentional fall-through: data arrays share the stride setup below
+    case NARRAY_FILEMAP_T:
+        s = LARG(lp,j).elmsz;                // byte step of the last dimension
+        for (k=na->ndim; k--;) {             // walk from the last dimension to the first
+            n = na->shape[k];
+            if (n > 1) {                     // length-1 dimensions keep the step they already have
+                i = dim_map[k];
+                //printf("n=%d k=%d i=%d\n",n,k,i);
+                LITER(lp,i,j).step = s;
+                LITER(lp,i,j).idx = NULL;
+            }
+            s *= n;                          // step of the next outer dimension
+        }
+        LITER(lp,0,j).pos = 0;
+        break;
+    case NARRAY_VIEW_T:
+        LITER(lp,0,j).pos = NA_VIEW_OFFSET(na);   // start at the view's offset
+        for (k=0; k<na->ndim; k++) {
+            n = na->shape[k];
+            sdx = NA_VIEW_STRIDX(na)[k];
+            if (n > 1) {
+                i = dim_map[k];
+                if (SDX_IS_INDEX(sdx)) {     // dimension addressed through an index array
+                    LITER(lp,i,j).step = 0;
+                    LITER(lp,i,j).idx = SDX_GET_INDEX(sdx);
+                } else {                     // dimension addressed by a stride
+                    LITER(lp,i,j).step = SDX_GET_STRIDE(sdx);
+                    LITER(lp,i,j).idx = NULL;
+                }
+            } else if (n==1) {
+                if (SDX_IS_INDEX(sdx)) {     // single indexed element: add its index to the start position
+                    LITER(lp,0,j).pos += SDX_GET_INDEX(sdx)[0];
+                }
+            }
+        }
+        break;
+    default:
+        rb_bug("invalid narray internal type");
+    }
+}
+
+
+
+static void  // bind every input argument to the loop and settle the loop shape lp->n[]
+ndloop_init_args(ndfunc_t *nf, na_md_loop_t *lp, VALUE args)
+{
+    int i, j;
+    VALUE v;
+    narray_t *na;
+    int nf_dim;
+    int dim_beg;
+    int *dim_map;
+    int max_nd = lp->ndim + lp->user.ndim;
+    int flag;
+    size_t s;
+
+/*
+na->shape[i] == lp->n[ dim_map[i] ]
+ */
+    dim_map = ALLOCA_N(int, max_nd);
+
+    // input arguments
+    for (j=0; j<nf->nin; j++) {   // NOTE(review): j indexes both nf->ain and LARG; presumably symbol slots come last - confirm
+        if (TYPE(nf->ain[j].type)==T_SYMBOL) {
+            continue;
+        }
+        v = RARRAY_AREF(args,j);
+        if (IsNArray(v)) {
+            // set LARG(lp,j) with v
+            GetNArray(v,na);
+            nf_dim = nf->ain[j].dim;         // dimensions consumed by the user function
+            if (nf_dim > na->ndim) {
+                rb_raise(nary_eDimensionError,"requires >= %d-dimensioal array "
+                         "while %d-dimensional array is given",nf_dim,na->ndim);
+            }
+            ndloop_check_shape(lp, nf_dim, na);   // merges the loop part of na's shape into lp->n[]
+            dim_beg = lp->ndim + nf->ain[j].dim - na->ndim;   // aligns na's last dimension with the last loop dimension
+            for (i=0; i<na->ndim; i++) {
+                dim_map[i] = lp->trans_map[i+dim_beg];
+                //printf("dim_map[%d]=%d na->shape[%d]=%d\n",i,dim_map[i],i,na->shape[i]);
+            }
+            if (nf->ain[j].type==OVERWRITE) {    // OVERWRITE inputs are opened for writing
+                lp->xargs[j].flag = flag = NDL_WRITE;
+            } else {
+                lp->xargs[j].flag = flag = NDL_READ;
+            }
+            ndloop_set_stepidx(lp, j, v, dim_map, flag);
+            LARG(lp,j).ndim = nf_dim;
+            if (nf_dim > 0) {
+                LARG(lp,j).shape = na->shape + (na->ndim - nf_dim);   // the last nf_dim entries of na's shape
+            }
+        } else if (TYPE(v)==T_ARRAY) {       // Ruby Array input: no data pointer, step 1 in every dimension
+            LARG(lp,j).value = v;
+            LARG(lp,j).elmsz = sizeof(VALUE);
+            LARG(lp,j).ptr   = NULL;
+            for (i=0; i<=max_nd; i++) {
+                LITER(lp,i,j).step = 1;
+            }
+        }
+    }
+    // check whether # of element is zero
+    for (s=1,i=0; i<=max_nd; i++) {
+        s *= lp->n[i];
+    }
+    if (s==0) {                              // some dimension is empty: mark every dimension as empty
+        for (i=0; i<=max_nd; i++) {
+            lp->n[i] = 0;
+        }
+    }
+}
+
+
+/*
+  Returns 1 if v can serve as the output array, i.e. its class is exactly
+  `type` and its shape equals na_shape[0..na_ndim-1]; returns 0 otherwise.
+*/
+static int
+ndloop_check_inplace(VALUE type, int na_ndim, size_t *na_shape, VALUE v)
+{
+    narray_t *na;
+    int d;
+
+    // the class is compared before v is unwrapped with GetNArray
+    if (CLASS_OF(v) != type) return 0;
+
+    GetNArray(v,na);
+    if (na_ndim != na->ndim) return 0;
+
+    for (d=0; d<na_ndim; d++) {
+        if (na->shape[d] != na_shape[d]) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/*
+  Look among the inputs for an array that may double as the output.
+  Pass 1: an NArray flagged inplace whose class and shape match; if that
+  argument is itself a cast copy, its index is stored in lp->writeback.
+  Pass 2: any argument whose bit is set in lp->copy_flag and matches.
+  Returns the chosen array, or Qnil if there is none.
+*/
+static VALUE
+ndloop_find_inplace(ndfunc_t *nf, na_md_loop_t *lp, VALUE type,
+                    int na_ndim, size_t *na_shape, VALUE args)
+{
+    int a;
+    VALUE cand;
+
+    // pass 1: inputs flagged inplace
+    for (a=0; a<nf->nin; a++) {
+        cand = RARRAY_AREF(args,a);
+        if ((IsNArray(cand)) && (TEST_INPLACE(cand)) &&
+            ndloop_check_inplace(type,na_ndim,na_shape,cand)) {
+            if (lp->copy_flag & (1<<a)) lp->writeback = a;
+            return cand;
+        }
+    }
+    // pass 2: cast or copied inputs
+    for (a=0; a<nf->nin; a++) {
+        if (!(lp->copy_flag & (1<<a))) continue;
+        cand = RARRAY_AREF(args,a);
+        if (ndloop_check_inplace(type,na_ndim,na_shape,cand)) {
+            return cand;
+        }
+    }
+    return Qnil;
+}
+
+
+
+/*
+  Resolve a type specification.  A Fixnum t refers to the t-th input: the
+  result is that input's declared type, or the class of the actual
+  argument when the declared type is not castable.  Else t is returned.
+*/
+static VALUE
+ndloop_get_arg_type(ndfunc_t *nf, VALUE args, VALUE t)
+{
+    int idx;
+    VALUE declared;
+
+    if (!FIXNUM_P(t)) return t;
+    idx = FIX2INT(t);
+    if (idx<0 || idx>=nf->nin) {
+        rb_bug("invalid type: index (%d) out of # of args",idx);
+    }
+    declared = nf->ain[idx].type;
+    return CASTABLE(declared) ? declared : CLASS_OF(RARRAY_AREF(args,idx));
+}
+
+
+static VALUE  // create (or reuse an input as) the k-th output NArray, bind it to the loop, and return it
+ndloop_set_output_narray(ndfunc_t *nf, na_md_loop_t *lp, int k,
+                         VALUE type, VALUE args)
+{
+    int i, j;
+    int na_ndim;
+    int lp_dim;
+    volatile VALUE v=Qnil;
+    size_t *na_shape;
+    int *dim_map;
+    int flag = NDL_READ_WRITE;     // used when an existing input is reused; a new array gets NDL_WRITE
+    int nd;
+    int max_nd = lp->ndim + nf->aout[k].dim;
+
+    na_shape = ALLOCA_N(size_t, max_nd);
+    dim_map = ALLOCA_N(int, max_nd);
+
+    //printf("max_nd=%d lp->ndim=%d\n",max_nd,lp->ndim);
+
+    // md-loop shape
+    na_ndim = 0;
+    for (i=0; i<lp->ndim; i++) {
+        // na_shape[i] == lp->n[lp->trans_map[i]]
+        lp_dim = lp->trans_map[i];
+        //printf("i=%d lp_dim=%d\n",i,lp_dim);
+        if (NDF_TEST(nf,NDF_CUM)) {   // cumulate with shape kept
+            na_shape[na_ndim] = lp->n[lp_dim];
+        } else
+        if (na_test_reduce(lp->reduce,lp_dim)) {   // accumulate dimension
+            if (NDF_TEST(nf,NDF_KEEP_DIM)) {
+                na_shape[na_ndim] = 1;         // leave it
+            } else {
+                continue;  // delete dimension
+            }
+        } else {
+            na_shape[na_ndim] = lp->n[lp_dim];
+        }
+        //printf("i=%d lp_dim=%d na_shape[%d]=%ld\n",i,lp_dim,i,na_shape[i]);
+        dim_map[na_ndim++] = lp_dim;   // output dimension na_ndim iterates over loop dimension lp_dim
+        //dim_map[lp_dim] = na_ndim++;
+    }
+
+    // user-specified shape
+    for (i=0; i<nf->aout[k].dim; i++) {
+        na_shape[na_ndim] = nf->aout[k].shape[i];
+        dim_map[na_ndim++] = i + lp->ndim;   // user dimensions follow the loop dimensions
+    }
+
+    // find inplace from input arrays
+    if (k==0 && NDF_TEST(nf,NDF_INPLACE)) {   // only the first output may reuse an input
+        v = ndloop_find_inplace(nf,lp,type,na_ndim,na_shape,args);
+    }
+    if (!RTEST(v)) {
+        // new object
+        v = nary_new(type, na_ndim, na_shape);
+        flag = NDL_WRITE;
+    }
+
+    j = lp->nin + k;               // outputs are stored after the inputs
+    ndloop_set_stepidx(lp, j, v, dim_map, flag);
+    LARG(lp,j).ndim = nd = nf->aout[k].dim;
+    if (nd > 0) {
+        LARG(lp,j).shape = nf->aout[k].shape;
+    }
+
+    return v;
+}
+
+static VALUE  // set up every output of nf; returns a Ruby Array holding the NArray outputs
+ndloop_set_output(ndfunc_t *nf, na_md_loop_t *lp, VALUE args)
+{
+    int i, j, k, idx;
+    volatile VALUE v, t, results;
+    VALUE init;
+
+    int max_nd = lp->ndim + lp->user.ndim;
+
+    // output results
+    results = rb_ary_new2(nf->nout);
+
+    for (k=0; k<nf->nout; k++) {
+        t = nf->aout[k].type;
+        t = ndloop_get_arg_type(nf,args,t);   // a Fixnum type is resolved through the referenced input
+
+        if (rb_obj_is_kind_of(t, rb_cClass)) {   // a type that is not a Class is skipped
+            if (RTEST(rb_class_inherited_p(t, cNArray))) {
+                // NArray
+                v = ndloop_set_output_narray(nf,lp,k,t,args);
+                rb_ary_push(results, v);
+            }
+            else if (RTEST(rb_class_inherited_p(t, rb_cArray))) {
+                // Ruby Array
+                j = lp->nin + k;                 // outputs are stored after the inputs
+                for (i=0; i<=max_nd; i++) {
+                    LITER(lp,i,j).step = sizeof(VALUE);
+                }
+                LARG(lp,j).value = t;            // the class itself is stored; nothing is pushed to results
+                LARG(lp,j).elmsz = sizeof(VALUE);
+            } else {
+                rb_raise(rb_eRuntimeError,"ndloop_set_output: invalid for type");
+            }
+        }
+    }
+
+    // initializer
+    k = lp->init_aidx;
+    if (k > -1) {
+        idx = nf->ain[k].dim;                    // for the initializer slot, dim is used as an index into results
+        v = RARRAY_AREF(results,idx);
+        init = RARRAY_AREF(args,k);
+        na_store(v,init);                        // store the initializer argument into that output
+    }
+
+    return results;
+}
+
+
+static void  // merge adjacent loop dimensions that every argument can walk as a single dimension
+ndfunc_contract_loop(na_md_loop_t *lp)
+{
+    int i,j,k,success,cnt=0;   // cnt: number of dimensions merged away so far
+    int red0, redi;            // reduce status of dimension i-1 and of dimension i
+
+    redi = na_test_reduce(lp->reduce,0);
+
+    //for (i=0; i<lp->ndim; i++) {
+    //    printf("lp->n[%d]=%lu\n",i,lp->n[i]);
+    //}
+
+    for (i=1; i<lp->ndim; i++) {
+        red0 = redi;
+        redi = na_test_reduce(lp->reduce,i);
+        //printf("contract i=%d reduce_cond=%d %d\n",i,red0,redi);
+        if (red0 != redi) {    // never merge a reduce dimension with a non-reduce dimension
+            continue;
+        }
+        success = 1;
+        for (j=0; j<lp->narg; j++) {   // both dimensions must be stride-based, with step[i-1] == step[i]*n[i]
+            if (!(LITER(lp,i,j).idx == NULL &&
+                  LITER(lp,i-1,j).idx == NULL &&
+                  LITER(lp,i-1,j).step == LITER(lp,i,j).step*(ssize_t)(lp->n[i]))) {
+                success = 0;
+                break;
+            }
+        }
+        if (success) {
+            //printf("contract i=%d-th and %d-th, lp->n[%d]=%"SZF"d, lp->n[%d]=%"SZF"d\n",
+            //       i-1,i, i,lp->n[i], i-1,lp->n[i-1]);
+            // contract (i-1)-th and i-th dimension
+            lp->n[i] *= lp->n[i-1];
+            // shift dimensions
+            for (k=i-1; k>cnt; k--) {   // move the remaining outer dimensions one slot towards i
+                lp->n[k] = lp->n[k-1];
+            }
+            //printf("k=%d\n",k);
+            for (; k>=0; k--) {         // the vacated leading slots get length 1
+                lp->n[k] = 1;
+            }
+            for (j=0; j<lp->narg; j++) {   // shift the iterators the same way as the lengths
+                for (k=i-1; k>cnt; k--) {
+                    LITER(lp,k,j) = LITER(lp,k-1,j);
+                }
+            }
+            if (redi) {                 // two reduce dimensions became one
+                lp->reduce_dim--;
+            }
+            cnt++;
+        }
+    }
+    //printf("contract cnt=%d\n",cnt);
+    if (cnt>0) {                        // drop the cnt leading slots that are no longer used
+        for (j=0; j<lp->narg; j++) {
+            LITER(lp,cnt,j).pos = LITER(lp,0,j).pos;   // carry the start position over to the new first dimension
+            lp->xargs[j].iter = &LITER(lp,cnt,j);
+        }
+        lp->n = &(lp->n[cnt]);          // lp->n_ptr still points at the start of the allocation
+        lp->ndim -= cnt;
+        //for (i=0; i<lp->ndim; i++) {printf("lp->n[%d]=%lu\n",i,lp->n[i]);}
+    }
+}
+
+
+/*
+  Decide how many trailing loop dimensions are handed to the user
+  function: all reduce dimensions if there are any, otherwise one when
+  the function has NDF_HAS_LOOP and a loop dimension exists.  Those
+  dimensions move from lp->ndim to lp->user.ndim; afterwards user.n and
+  every argument's iter point at the first user dimension.
+*/
+static void
+ndfunc_set_user_loop(ndfunc_t *nf, na_md_loop_t *lp)
+{
+    int a;
+    int moved = 0;   // number of loop dimensions handed to the user function
+
+    if (lp->reduce_dim > 0) {
+        moved = lp->reduce_dim;
+    } else if (lp->ndim > 0 && NDF_TEST(nf,NDF_HAS_LOOP)) {
+        moved = 1;
+    }
+
+    if (moved > 0) {
+        if (moved > lp->ndim) {
+            rb_bug("Reduce-dimension is larger than loop-dimension");
+        }
+        lp->user.ndim += moved;
+        lp->ndim -= moved;
+        for (a=0; a<lp->narg; a++) {
+            if (LARG(lp,a).shape) {
+                rb_bug("HAS_LOOP or reduce-dimension=%d conflicts with user-dimension",lp->reduce_dim);
+            }
+            LARG(lp,a).ndim += moved;
+            LARG(lp,a).shape = &(lp->n[lp->ndim]);
+        }
+    }
+
+    lp->user.n = &(lp->n[lp->ndim]);
+    for (a=0; a<lp->narg; a++) LARG(lp,a).iter = &LITER(lp,lp->ndim,a);
+}
+
+
+static void  // install a contiguous temporary buffer for arguments selected by loop_spec, then flatten reduce dimensions
+ndfunc_set_bufcp(na_md_loop_t *lp, unsigned int loop_spec)
+{
+    unsigned int f;          // per argument: 1 = non-contiguous step found, 2 = index array found
+    int i, j;
+    int nd, ndim;            // nd: user dimensions of the argument; ndim: dimensions left after contraction
+    bool zero_step;          // stays true while no index array and no non-zero step was seen
+    ssize_t n, sz, elmsz, stride, n_total; //, last_step;
+    size_t *buf_shape;
+    na_loop_iter_t *buf_iter=NULL, *src_iter;
+
+    //if (loop_spec==0) return;
+
+    n_total = lp->user.n[0];                 // total number of elements over all user dimensions
+    for (i=1; i<lp->user.ndim; i++) {
+        n_total *= lp->user.n[i];
+    }
+
+    //for (j=0; j<lp->nin; j++) {
+    for (j=0; j<lp->narg; j++) {
+        //ndim = nd = lp->user.ndim;
+        ndim = nd = LARG(lp,j).ndim;
+        sz = elmsz = LARG(lp,j).elmsz;
+        src_iter = LARG(lp,j).iter;
+        //last_step = src_iter[ndim-1].step;
+        f = 0;
+        zero_step = 1;
+        for (i=ndim; i>0; ) {                // scan the user dimensions from last to first
+            i--;
+            if (LARG(lp,j).shape) {
+                n = LARG(lp,j).shape[i];
+            } else {
+                printf("shape is NULL\n");   // diagnostic output left in the code path
+                n = lp->user.n[i];
+            }
+            stride = sz * n;                 // bytes covered by dimensions i..last if they were contiguous
+            //printf("{j=%d,i=%d,ndim=%d,nd=%d,idx=%lx,step=%ld,n=%ld,sz=%ld,stride=%ld}\n",j,i,ndim,nd,(size_t)src_iter[i].idx,src_iter[i].step,n,sz,stride);
+            if (src_iter[i].idx) {
+                f |= 2;  // INDEX LOOP
+                zero_step = 0;
+            } else {
+                if (src_iter[i].step != sz) {
+                    f |= 1;  // NON_CONTIGUOUS LOOP
+                } else {
+                    // CONTIGUOUS LOOP
+                    if (i==ndim-1) {  // contract if last dimension
+                        ndim = i;            // the copy loop no longer iterates this dimension
+                        elmsz = stride;      // it is covered by one larger memcpy instead
+                    }
+                }
+                if (src_iter[i].step != 0) {
+                    zero_step = 0;
+                }
+            }
+            sz = stride;
+        }
+        //printf("[j=%d f=%d loop_spec=%d zero_step=%d]\n",j,f,loop_spec,zero_step);
+
+        if (zero_step) {
+            // no buffer needed
+            continue;
+        }
+
+        // should check flatten-able loop to avoid buffering
+
+
+        // over loop_spec or reduce_loop is not contiguous
+        if (f & loop_spec || (lp->reduce_dim > 1 && ndim > 0)) {
+            //printf("(buf,nd=%d)",nd);
+            buf_iter = ALLOC_N(na_loop_iter_t,nd+3);   // NOTE(review): only entries 0..nd are initialised here; reason for +3 not visible
+            buf_shape = ALLOC_N(size_t,nd);
+            buf_iter[nd].pos = 0;
+            buf_iter[nd].step = 0;
+            buf_iter[nd].idx = NULL;
+            sz = LARG(lp,j).elmsz;
+            //last_step = sz;
+            for (i=nd; i>0; ) {              // the buffer gets contiguous steps, last dimension fastest
+                i--;
+                buf_iter[i].pos = 0;
+                buf_iter[i].step = sz;
+                buf_iter[i].idx = NULL;
+                //n = lp->user.n[i];
+                n = LARG(lp,j).shape[i];
+                buf_shape[i] = n;
+                sz *= n;
+            }
+            LBUFCP(lp,j) = ALLOC(na_buffer_copy_t);
+            LBUFCP(lp,j)->ndim = ndim;
+            LBUFCP(lp,j)->elmsz = elmsz;
+            LBUFCP(lp,j)->n = buf_shape;
+            LBUFCP(lp,j)->src_iter = src_iter;
+            LBUFCP(lp,j)->buf_iter = buf_iter;
+            LARG(lp,j).iter = buf_iter;      // from here on the user function iterates over the buffer
+            //printf("in ndfunc_set_bufcp(1): lp->user.args[%d].iter=%lx\n",j,(size_t)(LARG(lp,j).iter));
+            LBUFCP(lp,j)->src_ptr = LARG(lp,j).ptr;
+            LARG(lp,j).ptr = LBUFCP(lp,j)->buf_ptr = xmalloc(sz);   // sz: byte size of all user dimensions of this argument
+            //printf("(LBUFCP(lp,%d)->buf_ptr=%lx)\n",j,(size_t)(LBUFCP(lp,j)->buf_ptr));
+        }
+    }
+
+#if 0
+    for (j=0; j<lp->narg; j++) {
+        ndim = lp->user.ndim;
+        src_iter = LARG(lp,j).iter;
+        last_step = src_iter[ndim-1].step;
+        if (lp->reduce_dim>1) {
+            //printf("(reduce_dim=%d,ndim=%d,nd=%d,n=%ld,lst=%ld)\n",lp->reduce_dim,ndim,nd,n_total,last_step);
+            buf_iter = ALLOC_N(na_loop_iter_t,2);
+            buf_iter[0].pos = LARG(lp,j).iter[0].pos;
+            buf_iter[0].step = last_step;
+            buf_iter[0].idx = NULL;
+            buf_iter[1].pos = 0;
+            buf_iter[1].step = 0;
+            buf_iter[1].idx = NULL;
+            LARG(lp,j).iter = buf_iter;
+            //printf("in ndfunc_set_bufcp(2): lp->user.args[%d].iter=%lx\n",j,(size_t)(LARG(lp,j).iter));
+            lp->xargs[j].free_user_iter = 1;
+        }
+    }
+#endif
+
+    // flatten reduce dimensions
+    if (lp->reduce_dim > 1) {
+#if 1
+        for (j=0; j<lp->narg; j++) {
+            ndim = lp->user.ndim;
+            LARG(lp,j).iter[0].step = LARG(lp,j).iter[ndim-1].step;   // the single flat dimension uses the last dimension's step
+            LARG(lp,j).iter[0].idx = NULL;
+        }
+#endif
+        lp->user.n[0] = n_total;             // one flat dimension of n_total elements
+        lp->user.ndim = 1;
+    }
+}
+
+// Gather: copy each element addressed through LITER_SRC(lp,i) from lp->src_ptr into lp->buf_ptr, packed back to back.
+static void
+ndloop_copy_to_buffer(na_buffer_copy_t *lp)
+{
+    size_t *c;
+    char *src, *buf;
+    int  i;
+    int  nd = lp->ndim;
+    size_t elmsz = lp->elmsz;
+    size_t buf_pos = 0;    // byte offset of the next write into lp->buf_ptr
+    DBG(size_t j);
+
+    //printf("\nto_buf nd=%d elmsz=%ld\n",nd,elmsz);
+    DBG(printf("<to buf> ["));
+    // zero-dimension: exactly one element of elmsz bytes is copied
+    if (nd==0) {
+        src = lp->src_ptr + LITER_SRC(lp,0).pos;
+        buf = lp->buf_ptr;
+        memcpy(buf,src,elmsz);
+        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(buf))[j]);});
+        goto loop_end;
+    }
+    // initialize loop counter (one slot per dimension, plus one spare)
+    c = ALLOCA_N(size_t, nd+1);
+    for (i=0; i<=nd; i++) c[i]=0;
+    // loop body
+    for (i=0;;) {
+        // i-th dimension: derive the byte position at level i+1 from level i,
+        for (; i<nd; i++) {
+            if (LITER_SRC(lp,i).idx) {    // via the index table when one is present
+                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).idx[c[i]];
+            } else {                      // otherwise via step * counter
+                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).step*c[i];
+            }
+        }
+        src = lp->src_ptr + LITER_SRC(lp,nd).pos;
+        buf = lp->buf_ptr + buf_pos;
+        memcpy(buf,src,elmsz);
+        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(buf))[j]);});
+        buf_pos += elmsz;
+        // count up: bump the innermost counter, carrying into outer ones; done when level 0 wraps
+        for (;;) {
+            if (i<=0) goto loop_end;
+            i--;
+            if (++c[i] < lp->n[i]) break;
+            c[i] = 0;
+        }
+    }
+ loop_end:
+    ;
+    DBG(printf("]\n"));
+}
+// Scatter: copy the packed elements in lp->buf_ptr back to the lp->src_ptr positions addressed through LITER_SRC(lp,i).
+static void
+ndloop_copy_from_buffer(na_buffer_copy_t *lp)
+{
+    size_t *c;
+    char *src, *buf;
+    int  i;
+    int  nd = lp->ndim;
+    size_t elmsz = lp->elmsz;
+    size_t buf_pos = 0;    // byte offset of the next read from lp->buf_ptr
+    DBG(size_t j);
+
+    //printf("\nfrom_buf nd=%d elmsz=%ld\n",nd,elmsz);
+    DBG(printf("<from buf> ["));
+    // zero-dimension: exactly one element of elmsz bytes is copied back
+    if (nd==0) {
+        src = lp->src_ptr + LITER_SRC(lp,0).pos;
+        buf = lp->buf_ptr;
+        memcpy(src,buf,elmsz);
+        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(src))[j]);});
+        goto loop_end;
+    }
+    // initialize loop counter (one slot per dimension, plus one spare)
+    c = ALLOCA_N(size_t, nd+1);
+    for (i=0; i<=nd; i++) c[i]=0;
+    // loop body
+    for (i=0;;) {
+        // i-th dimension: derive the byte position at level i+1 from level i,
+        for (; i<nd; i++) {
+            if (LITER_SRC(lp,i).idx) {    // via the index table when one is present
+                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).idx[c[i]];
+            } else {                      // otherwise via step * counter
+                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).step*c[i];
+            }
+        }
+        src = lp->src_ptr + LITER_SRC(lp,nd).pos;
+        buf = lp->buf_ptr + buf_pos;
+        memcpy(src,buf,elmsz);    // note the direction: buffer -> original location
+        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(src))[j]);});
+        buf_pos += elmsz;
+        // count up: bump the innermost counter, carrying into outer ones; done when level 0 wraps
+        for (;;) {
+            if (i<=0) goto loop_end;
+            i--;
+            if (++c[i] < lp->n[i]) break;
+            c[i] = 0;
+        }
+    }
+ loop_end:
+    DBG(printf("]\n"));
+}
+
+// If lp->writeback is a valid index, store result 0 into that original argument and make it result 0.
+static void
+ndfunc_write_back(ndfunc_t *nf, na_md_loop_t *lp, VALUE orig_args, VALUE results)
+{
+    VALUE target, computed;
+
+    // negative index: there is nothing to write back
+    if (lp->writeback < 0) return;
+    target = RARRAY_AREF(orig_args,lp->writeback);
+    computed = RARRAY_AREF(results,0);
+    na_store(target,computed);
+    RARRAY_ASET(results,0,target);
+}
+
+// Tell whether x is an NArray whose NA_NDIM is 0.
+static int
+ndloop_is_zero_dim(VALUE x)
+{
+    narray_t *na;
+    if (IsNArray(x)) {
+        GetNArray(x,na);
+        return NA_NDIM(na)==0;
+    }
+    return 0;
+}
+
+// Shape the return value: nil for nout==0, result 0 for nout==1, otherwise the results array.
+// With NDF_EXTRACT set, 0-dim NArray results are replaced by what their "extract" method returns.
+static VALUE
+ndloop_extract(VALUE results, ndfunc_t *nf)
+{
+    long idx, len;
+    VALUE elem, scalar;
+
+    if (nf->nout == 0) return Qnil;
+    if (nf->nout == 1) {
+        elem = RARRAY_AREF(results,0);
+        if (NDF_TEST(nf,NDF_EXTRACT) && ndloop_is_zero_dim(elem)) {
+            elem = rb_funcall(elem, id_extract, 0);
+        }
+        return elem;
+    }
+    if (NDF_TEST(nf,NDF_EXTRACT)) {
+        len = RARRAY_LEN(results);
+        for (idx=0; idx<len; idx++) {
+            elem = RARRAY_AREF(results,idx);
+            if (!ndloop_is_zero_dim(elem)) continue;
+            scalar = rb_funcall(elem, id_extract, 0);
+            RARRAY_ASET(results,idx,scalar);
+        }
+    }
+    return results;
+}
+
+
+static void
+loop_narray(ndfunc_t *nf, na_md_loop_t *lp);
+// Run one prepared loop: vlp is a na_md_loop_t* cast to VALUE. Returns what ndloop_extract yields; raises lp->user.err_type if set.
+static VALUE
+ndloop_run(VALUE vlp)
+{
+    unsigned int loop_spec;
+    volatile VALUE args, orig_args, results;
+    na_md_loop_t *lp = (na_md_loop_t*)(vlp);
+    ndfunc_t *nf;
+
+    orig_args = lp->vargs;
+    nf = lp->ndfunc;
+
+    args = rb_obj_dup(orig_args);    // setup works on a duplicate; orig_args is kept for write-back
+
+    // setup ndloop iterator with arguments
+    ndloop_init_args(nf, lp, args);
+    results = ndloop_set_output(nf, lp, args);
+
+    //if (na_debug_flag) {
+    //    printf("-- ndloop_set_output --\n");
+    //    print_ndloop(lp);
+    //}
+
+    // contract loop (only when the default loop_narray driver is used)
+    if (lp->loop_func == loop_narray) {
+        ndfunc_contract_loop(lp);
+        //if (na_debug_flag) {
+        //    printf("-- ndfunc_contract_loop --\n");
+        //    print_ndloop(lp);
+        //}
+    }
+
+    // setup objects in which results are stored
+    ndfunc_set_user_loop(nf, lp);
+
+    // setup buffering during loop (only when the default loop_narray driver is used)
+    if (lp->loop_func == loop_narray) {
+        loop_spec = ndloop_func_loop_spec(nf, lp->user.ndim);
+        ndfunc_set_bufcp(lp, loop_spec);
+        if (na_debug_flag) {
+            printf("-- ndfunc_set_bufcp --\n");
+            print_ndloop(lp);
+        }
+    }
+
+    // loop: dispatch to the driver selected when the loop struct was allocated
+    (*(lp->loop_func))(nf, lp);
+
+    //if (na_debug_flag) {
+    //    printf("-- after loop --\n");
+    //    print_ndloop(lp);
+    //}
+    // a truthy user.err_type (presumably set by the iterator function) becomes a Ruby exception
+    if (RTEST(lp->user.err_type)) {
+        rb_raise(lp->user.err_type, "error in NArray operation");
+    }
+
+    // write-back will be placed here
+    ndfunc_write_back(nf, lp, orig_args, results);
+
+    // extract result objects
+    return ndloop_extract(results, nf);
+}
+
+
+// ---------------------------------------------------------------------------
+// Default loop driver: walk lp->ndim dimensions and call nf->func once per position, with buffer copy-in before and copy-out after.
+static void
+loop_narray(ndfunc_t *nf, na_md_loop_t *lp)
+{
+    size_t *c;
+    int  i, j;
+    int  nd = lp->ndim;
+
+    if (nd<0) {
+        rb_bug("bug? lp->ndim = %d\n", lp->ndim);
+    }
+    // no dimensions to walk: a single call of nf->func, with the same buffer handling as below
+    if (nd==0) {
+        for (j=0; j<lp->nin; j++) {
+            if (lp->xargs[j].bufcp) {
+                //printf("copy_to_buffer j=%d\n",j);
+                ndloop_copy_to_buffer(lp->xargs[j].bufcp);
+            }
+        }
+        (*(nf->func))(&(lp->user));
+        for (j=0; j<lp->narg; j++) {
+            if (lp->xargs[j].bufcp && (lp->xargs[j].flag & NDL_WRITE)) {
+                //printf("copy_from_buffer j=%d\n",j);
+                // copy data back from work buffer
+                ndloop_copy_from_buffer(lp->xargs[j].bufcp);
+            }
+        }
+        return;
+    }
+
+    // initialize loop counter
+    c = ALLOCA_N(size_t, nd+1);
+    for (i=0; i<=nd; i++) c[i]=0;
+
+    // loop body
+    for (i=0;;) {
+        // i-th dimension: derive each argument's position at level i+1 from level i
+        for (; i<nd; i++) {
+            // j-th argument
+            for (j=0; j<lp->narg; j++) {
+                if (LITER(lp,i,j).idx) {
+                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
+                } else {
+                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
+                }
+                //printf("j=%d c[i=%d]=%lu pos=%lu\n",j,i,c[i],LITER(lp,i+1,j).pos);
+            }
+        }
+        for (j=0; j<lp->nin; j++) {
+            if (lp->xargs[j].bufcp) {
+                // copy data to work buffer
+                // cp lp->iter[j][nd..*] to lp->user.args[j].iter[0..*]
+                //printf("copy_to_buffer j=%d\n",j);
+                ndloop_copy_to_buffer(lp->xargs[j].bufcp);
+            }
+        }
+        (*(nf->func))(&(lp->user));
+        for (j=0; j<lp->narg; j++) {
+            if (lp->xargs[j].bufcp && (lp->xargs[j].flag & NDL_WRITE)) {
+                // copy data back from work buffer (only for args flagged NDL_WRITE)
+                //printf("copy_from_buffer j=%d\n",j);
+                ndloop_copy_from_buffer(lp->xargs[j].bufcp);
+            }
+        }
+        if (RTEST(lp->user.err_type)) {return;}
+        // count up: bump the innermost counter, carrying into outer ones; done when level 0 wraps
+        for (;;) {
+            if (i<=0) goto loop_end;
+            i--;
+            if (++c[i] < lp->n[i]) break;
+            c[i] = 0;
+        }
+    }
+ loop_end:
+    ;
+}
+
+// Common entry point: cast args, fill a stack-allocated loop state, run it under rb_ensure so ndloop_release always runs.
+VALUE
+na_ndloop_main(ndfunc_t *nf, VALUE args, void *opt_ptr)
+{
+    na_md_loop_t loop_state;
+    unsigned int cast_flags;
+    VALUE loop_handle = (VALUE)&loop_state;
+
+    if (na_debug_flag) {
+        print_ndfunc(nf);
+    }
+    // cast the arguments to NArray; the returned flag is handed on to ndloop_alloc
+    cast_flags = ndloop_cast_args(nf, args);
+    // fill in the loop state with loop_narray as the loop driver
+    ndloop_alloc(&loop_state, nf, args, opt_ptr, cast_flags, loop_narray);
+    return rb_ensure(ndloop_run, loop_handle, ndloop_release, loop_handle);
+}
+
+// Variadic front end: pack argc VALUE arguments into a Ruby array and run na_ndloop_main with a NULL opt_ptr.
+VALUE
+#ifdef HAVE_STDARG_PROTOTYPES
+na_ndloop(ndfunc_t *nf, int argc, ...)
+#else
+na_ndloop(nf, argc, va_alist)
+  ndfunc_t *nf;
+  int argc;
+  va_dcl
+#endif
+{
+    va_list ap;
+    int k;
+    VALUE item;
+    volatile VALUE packed;
+
+    // Append each variadic VALUE to a pre-sized Ruby array, in call order,
+    // instead of staging them in a temporary C array first.
+    packed = rb_ary_new2(argc);
+    va_init_list(ap, argc);
+    for (k=0; k<argc; k++) {
+        item = va_arg(ap, VALUE);
+        rb_ary_push(packed, item);
+    }
+    va_end(ap);
+
+    // this entry point carries no opt_ptr
+    return na_ndloop_main(nf, packed, NULL);
+}
+
+// Array front end: `args` already holds the arguments as a Ruby array; runs na_ndloop_main with a NULL opt_ptr.
+VALUE
+na_ndloop2(ndfunc_t *nf, VALUE args)
+{
+    return na_ndloop_main(nf, args, NULL);
+}
+// Variadic front end with option pointer: pack argc VALUE arguments into a Ruby array and pass `ptr` as opt_ptr.
+VALUE
+#ifdef HAVE_STDARG_PROTOTYPES
+na_ndloop3(ndfunc_t *nf, void *ptr, int argc, ...)
+#else
+na_ndloop3(nf, ptr, argc, va_alist)
+  ndfunc_t *nf;
+  void *ptr;
+  int argc;
+  va_dcl
+#endif
+{
+    va_list ap;
+    int k;
+    VALUE item;
+    volatile VALUE packed;
+
+    // Append each variadic VALUE to a pre-sized Ruby array, in call order,
+    // instead of staging them in a temporary C array first.
+    packed = rb_ary_new2(argc);
+    va_init_list(ap, argc);
+    for (k=0; k<argc; k++) {
+        item = va_arg(ap, VALUE);
+        rb_ary_push(packed, item);
+    }
+    va_end(ap);
+
+    // `ptr` is forwarded untouched as the loop's opt_ptr
+    return na_ndloop_main(nf, packed, ptr);
+}
+// Array front end with option pointer: `args` is a Ruby array of arguments, `ptr` is forwarded as opt_ptr.
+VALUE
+na_ndloop4(ndfunc_t *nf, void *ptr, VALUE args)
+{
+    return na_ndloop_main(nf, args, ptr);
+}
+
+//----------------------------------------------------------------------
+// Build the summary string "<class name>[(view)]#shape=[n0,n1,...]" for the NArray `ary`.
+VALUE
+na_info_str(VALUE ary)
+{
+    narray_t *na;
+    VALUE info;
+    char digits[32];
+    int dim, ndim;
+
+    GetNArray(ary,na);
+    ndim = na->ndim;
+    info = rb_str_new2(rb_class2name(CLASS_OF(ary)));
+
+    // views are tagged right after the class name
+    if (NA_TYPE(na) == NARRAY_VIEW_T) {
+        rb_str_cat(info,"(view)",6);
+    }
+    // shape extents, comma separated, inside square brackets
+    rb_str_cat(info,"#shape=[",8);
+    for (dim=0; dim<ndim; dim++) {
+        // the comma goes between extents, never after the last one
+        if (dim>0) rb_str_cat(info,",",1);
+        sprintf(digits,"%"SZF"u",na->shape[dim]);
+        rb_str_cat2(info,digits);
+    }
+    rb_str_cat(info,"]",1);
+    return info;
+}
+
+
+//----------------------------------------------------------------------
+
+#define ncol numo_na_inspect_cols
+#define nrow numo_na_inspect_rows
+extern int ncol, nrow;
+// Loop driver for inspect: append a bracketed rendering of argument 0 to the string in lp->loop_opt, limited by ncol/nrow.
+static void
+loop_inspect(ndfunc_t *nf, na_md_loop_t *lp)
+{
+    int nd, i, ii;
+    size_t *c;
+    int col=0, row=0;
+    long len;
+    VALUE str;
+    na_text_func_t func = (na_text_func_t)(nf->func);
+    VALUE buf, opt;
+    // func renders one element as a Ruby string; buf is the output string; opt is passed through to func
+    nd = lp->ndim;
+    buf = lp->loop_opt;
+    //opt = *(VALUE*)(lp->user.opt_ptr);
+    opt = lp->user.option;
+    // any zero-length dimension means there is nothing to print: emit "[]" and stop
+    for (i=0; i<nd; i++) {
+        if (lp->n[i] == 0) {
+            rb_str_cat(buf,"[]",2);
+            return;
+        }
+    }
+
+    rb_str_cat(buf,"\n",1);
+    // loop counter, one slot per dimension plus one spare
+    c = ALLOCA_N(size_t, nd+1);
+    for (i=0; i<=nd; i++) c[i]=0;
+
+    if (nd>0) {
+        rb_str_cat(buf,"[",1);
+    } else {
+        rb_str_cat(buf,"",0);
+    }
+    // col tracks the width used on the current output line; it restarts at nd*2 on every new line
+    col = nd*2;
+    for (i=0;;) {
+        if (i<nd-1) {
+            for (ii=0; ii<i; ii++) rb_str_cat(buf," ",1);
+            for (; ii<nd-1; ii++) rb_str_cat(buf,"[",1);
+        }
+        for (; i<nd; i++) {
+            if (LITER(lp,i,0).idx) {
+                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
+            } else {
+                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
+            }
+        }
+        str = (*func)(LARG(lp,0).ptr, LITER(lp,i,0).pos, opt);
+        // +2 leaves room for the ", " separator appended after an element
+        len = RSTRING_LEN(str) + 2;
+        if (ncol>0 && col+len > ncol-3) {
+            rb_str_cat(buf,"...",3);
+            if (i>0) c[i-1] = lp->n[i-1];    // end this row early; a 0-dim array (i==0) has no counter at index -1
+        } else {
+            rb_str_append(buf, str);
+            col += len;
+        }
+        for (;;) {
+            if (i==0) goto loop_end;
+            i--;
+            if (++c[i] < lp->n[i]) break;
+            rb_str_cat(buf,"]",1);
+            c[i] = 0;
+        }
+        //line_break:
+        rb_str_cat(buf,", ",2);
+        if (i<nd-1) {
+            rb_str_cat(buf,"\n ",2);
+            col = nd*2;
+            row++;
+            if (row==nrow) {
+                rb_str_cat(buf,"...",3);
+                goto loop_end;
+            }
+        }
+    }
+ loop_end:
+    ;
+}
+
+// Build the inspect string of `nary`: the na_info_str header followed by the elements rendered through `func` by loop_inspect.
+VALUE
+na_ndloop_inspect(VALUE nary, na_text_func_t func, VALUE opt)
+{
+    volatile VALUE args;
+    na_md_loop_t lp;
+    VALUE buf;
+    ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
+    ndfunc_t nf = { (na_iter_func_t)func, NO_LOOP, 3, 0, ain, 0 };
+    //nf = ndfunc_alloc(NULL, NO_LOOP, 1, 0, Qnil);
+
+    buf = na_info_str(nary);
+    // no data pointer: return the header with "(empty)" appended, without running a loop
+    if (na_get_pointer(nary)==NULL) {
+        return rb_str_cat(buf,"(empty)",7);
+    }
+
+    //rb_p(args);
+    //if (na_debug_flag) print_ndfunc(&nf);
+    // three arguments matching ain[]: the array, the output string, the option value
+    args = rb_ary_new3(3,nary,buf,opt);
+
+    // cast arguments to NArray
+    //ndloop_cast_args(nf, args);
+
+    // allocate ndloop struct
+    ndloop_alloc(&lp, &nf, args, NULL, 0, loop_inspect);
+
+    rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
+    // the return value of rb_ensure is ignored; buf is what gets returned
+    return buf;
+}
+
+
+//----------------------------------------------------------------------
+// Feed NArray `a`, found at loop level i0, to nf->func as argument 1 while iterating dimensions i0..nd-1 with the shared counter c.
+static void
+loop_store_subnarray(ndfunc_t *nf, na_md_loop_t *lp, int i0, size_t *c, VALUE a)
+{
+    int nd = lp->ndim;
+    int i, j;
+    narray_t *na;
+    int *dim_map;
+    VALUE a_type;
+    // cast `a` to the class of argument 0's value when the classes differ
+    a_type = CLASS_OF(LARG(lp,0).value);
+    if (CLASS_OF(a) != a_type) {
+        a = rb_funcall(a_type, id_cast, 1, a);
+    }
+    GetNArray(a,na);
+    if (na->ndim != nd-i0+1) {
+        rb_raise(nary_eShapeError, "mismatched dimension of sub-narray: "
+                 "nd_src=%d, nd_dst=%d", na->ndim, nd-i0+1);
+    }
+    dim_map = ALLOCA_N(int, na->ndim);
+    for (i=0; i<na->ndim; i++) {
+        dim_map[i] = lp->trans_map[i+i0];
+        //printf("dim_map[i=%d] = %d, i0=%d\n", i, dim_map[i], i0);
+    }
+    ndloop_set_stepidx(lp, 1, a, dim_map, NDL_READ);
+    LARG(lp,1).shape = &(na->shape[na->ndim-1]);
+    // NOTE(review): LARG(lp,1).value is used below as a flag, not as data - confirm how nf->func reads it
+    // loop body
+    for (i=i0;;) {
+        LARG(lp,1).value = Qtrue;
+        for (; i<nd; i++) {
+            for (j=0; j<2; j++) {
+                if (LITER(lp,i,j).idx) {
+                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
+                } else {
+                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
+                }
+            }
+            if (c[i] >= na->shape[i-i0]) {    // counter is past the extent of `a` in this dimension
+                LARG(lp,1).value = Qfalse;
+            }
+        }
+
+        (*(nf->func))(&(lp->user));
+        // count up, but never above level i0: the levels above belong to the caller's loop
+        for (;;) {
+            if (i<=i0) goto loop_end;
+            i--; c[i]++;
+            if (c[i] < lp->n[i]) break;
+            c[i] = 0;
+        }
+    }
+ loop_end:
+    LARG(lp,1).ptr = NULL;
+}
+
+// Loop driver: walk argument 0's dimensions in step with the nested object in LARG(lp,1).value, calling nf->func per leaf.
+static void
+loop_store_rarray(ndfunc_t *nf, na_md_loop_t *lp)
+{
+    size_t *c;
+    int     i;
+    VALUE  *a;
+    int nd = lp->ndim;
+
+    // counter
+    c = ALLOCA_N(size_t, nd+1);
+    for (i=0; i<=nd; i++) c[i]=0;
+
+    // array at each dimension: a[i] is the object reached at nesting level i
+    a = ALLOCA_N(VALUE, nd+1);
+    a[0] = LARG(lp,1).value;
+
+    //print_ndloop(lp);
+
+    // loop body
+    for (i=0;;) {
+        for (; i<nd; i++) {
+            if (LITER(lp,i,0).idx) {
+                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
+            } else {
+                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
+            }
+            if (TYPE(a[i])==T_ARRAY) {
+                if (c[i] < (size_t)RARRAY_LEN(a[i])) {
+                    a[i+1] = RARRAY_AREF(a[i],c[i]);
+                } else {
+                    a[i+1] = Qnil;    // Ruby array shorter than this dimension: descend with nil
+                }
+            } else if (IsNArray(a[i])) {
+                //printf("a[i=%d]=0x%lx\n",i,a[i]);
+                loop_store_subnarray(nf,lp,i,c,a[i]);    // the sub-loop covers the levels from i on
+                goto loop_next;
+            } else {
+                if (c[i]==0) {    // any other object is passed down at index 0 only, nil elsewhere
+                    a[i+1] = a[i];
+                } else {
+                    a[i+1] = Qnil;
+                }
+            }
+            //printf("c[%d]=%lu\n",i,c[i]);
+        }
+        // leaf level (i==nd): an NArray gets its own sub-loop, anything else goes to nf->func as argument 1
+        //printf("a[i=%d]=0x%lx\n",i,a[i]);
+        if (IsNArray(a[i])) {
+            loop_store_subnarray(nf,lp,i,c,a[i]);
+        } else {
+            LARG(lp,1).value = a[i];
+            (*(nf->func))(&(lp->user));
+        }
+
+    loop_next:
+        for (;;) {
+            if (i<=0) goto loop_end;
+            i--; c[i]++;
+            if (c[i] < lp->n[i]) break;
+            c[i] = 0;
+        }
+    }
+ loop_end:
+    ;
+}
+// Run nf over the pair (nary, rary) with loop_store_rarray as the loop driver; returns what ndloop_run returns.
+VALUE
+na_ndloop_store_rarray(ndfunc_t *nf, VALUE nary, VALUE rary)
+{
+    VALUE pair;
+    na_md_loop_t loop_state;
+    VALUE loop_handle = (VALUE)&loop_state;
+
+    if (na_debug_flag) {
+        print_ndfunc(nf);
+    }
+
+    // arguments travel as the two-element array [nary, rary];
+    // no ndloop_cast_args step is applied on this path
+    pair = rb_assoc_new(nary,rary);
+
+    // fill in the loop state with loop_store_rarray as the driver
+    ndloop_alloc(&loop_state, nf, pair, NULL, 0, loop_store_rarray);
+    return rb_ensure(ndloop_run, loop_handle, ndloop_release, loop_handle);
+}
+
+// Run nf over (nary, rary, opt) with loop_store_rarray as the loop driver; returns what ndloop_run returns.
+VALUE
+na_ndloop_store_rarray2(ndfunc_t *nf, VALUE nary, VALUE rary, VALUE opt)
+{
+    VALUE triple;
+    na_md_loop_t loop_state;
+    VALUE loop_handle = (VALUE)&loop_state;
+
+    if (na_debug_flag) {
+        print_ndfunc(nf);
+    }
+
+    // arguments travel as the three-element array [nary, rary, opt];
+    // no ndloop_cast_args step is applied on this path
+    triple = rb_ary_new3(3,nary,rary,opt);
+
+    // fill in the loop state with loop_store_rarray as the driver
+    ndloop_alloc(&loop_state, nf, triple, NULL, 0, loop_store_rarray);
+
+    return rb_ensure(ndloop_run, loop_handle, ndloop_release, loop_handle);
+}
+
+
+//----------------------------------------------------------------------
+// Loop driver: build nested Ruby arrays mirroring the loop dimensions; nf->func gets the innermost array in LARG(lp,1).value.
+static void
+loop_narray_to_rarray(ndfunc_t *nf, na_md_loop_t *lp)
+{
+    size_t *c;
+    int i;
+    //int nargs = nf->narg + nf->nres;
+    int nd = lp->ndim;
+    VALUE *a;
+    volatile VALUE a0;
+
+    // alloc counter
+    c = ALLOCA_N(size_t, nd+1);
+    for (i=0; i<=nd; i++) c[i]=0;
+    //c[i]=1; // for zero-dim
+    //fprintf(stderr,"in loop_narray_to_rarray, nd=%d\n",nd);
+    // a[i] is the Ruby array currently being filled at nesting level i; a[0] is the root taken from lp->loop_opt
+    a = ALLOCA_N(VALUE, nd+1);
+    a[0] = a0 = lp->loop_opt;
+
+    // loop body
+    for (i=0;;) {
+        for (; i<nd; i++) {
+            if (LITER(lp,i,0).idx) {
+                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
+            } else {
+                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
+            }
+            if (c[i]==0) {    // first visit at this level: start a new child array and attach it to its parent
+                a[i+1] = rb_ary_new2(lp->n[i]);
+                rb_ary_push(a[i],a[i+1]);
+            }
+        }
+
+        //lp->user.info = a[i];
+        LARG(lp,1).value = a[i];
+        (*(nf->func))(&(lp->user));
+        // count up: bump the innermost counter, carrying into outer ones; done when level 0 wraps
+        for (;;) {
+            if (i<=0) goto loop_end;
+            i--;
+            if (++c[i] < lp->n[i]) break;
+            c[i] = 0;
+        }
+    }
+ loop_end:
+    ;
+}
+// Run nf over (nary, fresh array, fmt) with loop_narray_to_rarray as the driver; returns element 0 of that array.
+VALUE
+na_ndloop_cast_narray_to_rarray(ndfunc_t *nf, VALUE nary, VALUE fmt)
+{
+    VALUE root, packed;
+    na_md_loop_t loop_state;
+    VALUE loop_handle = (VALUE)&loop_state;
+
+    if (na_debug_flag) {
+        print_ndfunc(nf);
+    }
+
+    // `root` is only a container: its element 0 is handed back once the loop
+    // has run (presumably filled by the driver through lp->loop_opt)
+    root = rb_ary_new();
+    packed = rb_ary_new3(3,nary,root,fmt);
+
+    // no ndloop_cast_args step on this path; go straight to allocation
+    ndloop_alloc(&loop_state, nf, packed, NULL, 0, loop_narray_to_rarray);
+    rb_ensure(ndloop_run, loop_handle, ndloop_release, loop_handle);
+    return RARRAY_AREF(root,0);
+}
+
+
+//----------------------------------------------------------------------
+// Loop driver that exposes the counter array to the iterator through lp->user.opt_ptr; it does no buffer copying.
+static void
+loop_narray_with_index(ndfunc_t *nf, na_md_loop_t *lp)
+{
+    size_t *c;
+    int i,j;
+    int nd = lp->ndim;
+
+    // pass total ndim to iterator
+    lp->user.ndim += nd;
+
+    // alloc counter (the same array is what the iterator sees via user.opt_ptr)
+    lp->user.opt_ptr = c = ALLOCA_N(size_t, nd+1);
+    for (i=0; i<=nd; i++) c[i]=0;
+
+    // loop body
+    for (i=0;;) {
+        for (; i<nd; i++) {
+            // j-th argument
+            for (j=0; j<lp->narg; j++) {
+                if (LITER(lp,i,j).idx) {
+                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
+                } else {
+                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
+                }
+                //printf("j=%d c[i=%d]=%lu pos=%lu\n",j,i,c[i],LITER(lp,i+1,j).pos);
+            }
+        }
+
+        (*(nf->func))(&(lp->user));
+        // count up: bump the innermost counter, carrying into outer ones; done when level 0 wraps
+        for (;;) {
+            if (i<=0) goto loop_end;
+            i--;
+            if (++c[i] < lp->n[i]) break;
+            c[i] = 0;
+        }
+    }
+ loop_end:
+    ;
+}
+
+// Variadic front end that runs nf with loop_narray_with_index as the driver; arguments are not cast beforehand.
+VALUE
+#ifdef HAVE_STDARG_PROTOTYPES
+na_ndloop_with_index(ndfunc_t *nf, int argc, ...)
+#else
+na_ndloop_with_index(nf, argc, va_alist)
+  ndfunc_t *nf;
+  int argc;
+  va_dcl
+#endif
+{
+    va_list ar;
+
+    int i;
+    VALUE *argv;
+    volatile VALUE args;
+    na_md_loop_t lp;
+
+    argv = ALLOCA_N(VALUE,argc);
+    // collect the argc variadic VALUE arguments, in call order
+    va_init_list(ar, argc);
+    for (i=0; i<argc; i++) {
+        argv[i] = va_arg(ar, VALUE);
+    }
+    va_end(ar);
+
+    args = rb_ary_new4(argc, argv);
+
+    //return na_ndloop_main(nf, args, NULL);
+    if (na_debug_flag) print_ndfunc(nf);
+
+    // cast arguments to NArray
+    //copy_flag = ndloop_cast_args(nf, args);
+
+    // allocate ndloop struct (NULL opt_ptr and a zero copy flag, as in the other non-casting callers)
+    ndloop_alloc(&lp, nf, args, NULL, 0, loop_narray_with_index);
+
+    return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
+}
+
+// Intern the method-name IDs "cast" and "extract" that the loop code in this file passes to rb_funcall.
+void
+Init_nary_ndloop()
+{
+    id_cast    = rb_intern("cast");
+    id_extract = rb_intern("extract");
+}
diff --git a/ext/numo/narray/numo/compat.h b/ext/numo/narray/numo/compat.h
new file mode 100644
index 0000000..8a6907a
--- /dev/null
+++ b/ext/numo/narray/numo/compat.h
@@ -0,0 +1,23 @@
+#ifndef COMPAT_H
+#define COMPAT_H
+// Fallback definitions of the RSTRING_*/RARRAY_* accessor macros, each used only when not already defined.
+#if !defined RSTRING_LEN
+#define RSTRING_LEN(a) RSTRING(a)->len
+#endif
+#if !defined RSTRING_PTR
+#define RSTRING_PTR(a) RSTRING(a)->ptr
+#endif
+#if !defined RARRAY_LEN
+#define RARRAY_LEN(a) RARRAY(a)->len
+#endif
+#if !defined RARRAY_PTR
+#define RARRAY_PTR(a) RARRAY(a)->ptr
+#endif
+#if !defined RARRAY_AREF
+#define RARRAY_AREF(a,i) RARRAY_PTR(a)[i]
+#endif
+#if !defined RARRAY_ASET
+#define RARRAY_ASET(a,i,v) (RARRAY_PTR(a)[i] = v)
+#endif
+
+#endif /* ifndef COMPAT_H */
diff --git a/ext/numo/narray/numo/intern.h b/ext/numo/narray/numo/intern.h
new file mode 100644
index 0000000..3bcb246
--- /dev/null
+++ b/ext/numo/narray/numo/intern.h
@@ -0,0 +1,109 @@
+/*
+  intern.h
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#ifndef INTERN_H
+#define INTERN_H
+// Each #define below maps a short alias onto the nary_*/numo_* symbol whose prototype follows it.
+#define rb_narray_new nary_new
+VALUE nary_new(VALUE elem, int ndim, size_t *shape);
+#define rb_narray_view_new nary_view_new
+VALUE nary_view_new(VALUE elem, int ndim, size_t *shape);
+#define rb_narray_debug_info nary_debug_info
+VALUE nary_debug_info(VALUE);
+
+#define na_make_view nary_make_view
+VALUE nary_make_view(VALUE self);
+
+#define na_s_allocate nary_s_allocate
+VALUE nary_s_allocate(VALUE klass);
+#define na_s_allocate_view nary_s_allocate_view
+VALUE nary_s_allocate_view(VALUE klass);
+#define na_s_new_like nary_s_new_like
+VALUE nary_s_new_like(VALUE type, VALUE obj);
+// shape helpers, declared directly under their na_* names (no alias)
+void na_alloc_shape(narray_t *na, int ndim);
+void na_array_to_internal_shape(VALUE self, VALUE ary, size_t *shape);
+void na_index_arg_to_internal_order(int argc, VALUE *argv, VALUE self);
+void na_setup_shape(narray_t *na, int ndim, size_t *shape);
+
+#define na_get_elmsz nary_element_stride
+//#define na_element_stride nary_element_stride
+unsigned int nary_element_stride(VALUE nary);
+#define na_dtype_elmsz nary_dtype_element_stride
+size_t nary_dtype_element_stride(VALUE klass);
+
+#define na_get_pointer nary_get_pointer
+char *nary_get_pointer(VALUE);
+#define na_get_pointer_for_write nary_get_pointer_for_write
+char *nary_get_pointer_for_write(VALUE);
+#define na_get_pointer_for_read nary_get_pointer_for_read
+char *nary_get_pointer_for_read(VALUE);
+#define na_get_pointer_for_read_write nary_get_pointer_for_read_write
+char *nary_get_pointer_for_read_write(VALUE);
+#define na_get_offset nary_get_offset
+size_t nary_get_offset(VALUE self);
+
+#define na_copy_flags nary_copy_flags
+void nary_copy_flags(VALUE src, VALUE dst);
+
+#define na_check_ladder nary_check_ladder
+VALUE nary_check_ladder(VALUE self, int start_dim);
+#define na_check_contiguous nary_check_contiguous
+VALUE nary_check_contiguous(VALUE self);
+
+#define na_flatten_dim nary_flatten_dim
+VALUE nary_flatten_dim(VALUE self, int sd);
+
+#define na_flatten nary_flatten
+VALUE nary_flatten(VALUE);
+
+#define na_copy nary_dup
+VALUE nary_dup(VALUE);
+
+#define na_store nary_store
+VALUE nary_store(VALUE self, VALUE src);
+
+#define na_upcast numo_na_upcast
+VALUE numo_na_upcast(VALUE type1, VALUE type2);
+
+void na_release_lock(VALUE); // currently does nothing
+
+// used in reduce methods
+#define na_reduce_dimension nary_reduce_dimension
+VALUE nary_reduce_dimension(int argc, VALUE *argv, int naryc, VALUE *naryv,
+                            ndfunc_t *ndf, na_iter_func_t nan_iter);
+
+#define na_reduce_options nary_reduce_options
+VALUE nary_reduce_options(VALUE axes, VALUE *opts, int naryc, VALUE *naryv,
+                          ndfunc_t *ndf);
+
+// ndloop: general entry points (variadic or Ruby-array arguments, with or without an option pointer)
+VALUE na_ndloop(ndfunc_t *nf, int argc, ...);
+VALUE na_ndloop2(ndfunc_t *nf, VALUE args);
+VALUE na_ndloop3(ndfunc_t *nf, void *ptr, int argc, ...);
+VALUE na_ndloop4(ndfunc_t *nf, void *ptr, VALUE args);
+// ndloop: special-purpose entry points
+VALUE na_ndloop_cast_narray_to_rarray(ndfunc_t *nf, VALUE nary, VALUE fmt);
+VALUE na_ndloop_store_rarray(ndfunc_t *nf, VALUE nary, VALUE rary);
+VALUE na_ndloop_store_rarray2(ndfunc_t *nf, VALUE nary, VALUE rary, VALUE opt);
+VALUE na_ndloop_inspect(VALUE nary, na_text_func_t func, VALUE opt);
+VALUE na_ndloop_with_index(ndfunc_t *nf, int argc, ...);
+
+#define na_info_str nary_info_str
+VALUE nary_info_str(VALUE);
+
+#define na_test_reduce nary_test_reduce
+bool nary_test_reduce(VALUE reduce, int dim);
+
+void nary_step_array_index(VALUE self, size_t ary_size, size_t *plen, ssize_t *pbeg, ssize_t *pstep);
+void nary_step_sequence(VALUE self, size_t *plen, double *pbeg, double *pstep);
+
+// used in aref, aset
+#define na_get_result_dimension nary_get_result_dimension
+int nary_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride, size_t *pos_idx);
+#define na_aref_main nary_aref_main
+VALUE nary_aref_main(int nidx, VALUE *idx, VALUE self, int keep_dim, int nd);
+
+#endif /* ifndef INTERN_H */
diff --git a/ext/numo/narray/numo/narray.h b/ext/numo/narray/numo/narray.h
new file mode 100644
index 0000000..7f7ebf2
--- /dev/null
+++ b/ext/numo/narray/numo/narray.h
@@ -0,0 +1,429 @@
+/*
+  narray.h
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#ifndef NARRAY_H
+#define NARRAY_H
+
+#if defined(__cplusplus)
+extern "C" {
+#if 0
+} /* satisfy cc-mode */
+#endif
+#endif
+
+#define NARRAY_VERSION "0.9.0.7"
+#define NARRAY_VERSION_CODE 907
+
+#include <math.h>
+#include "numo/compat.h"
+#include "numo/template.h"
+#include "numo/extconf.h"
+
+#ifdef HAVE_STDBOOL_H
+# include <stdbool.h>
+#endif
+
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+
+#ifndef HAVE_U_INT8_T
+# ifdef HAVE_UINT8_T
+    typedef uint8_t u_int8_t;
+# endif
+#endif
+
+#ifndef HAVE_U_INT16_T
+# ifdef HAVE_UINT16_T
+    typedef uint16_t u_int16_t;
+# endif
+#endif
+
+#ifndef HAVE_U_INT32_T
+# ifdef HAVE_UINT32_T
+    typedef uint32_t u_int32_t;
+# endif
+#endif
+
+#ifndef HAVE_U_INT64_T
+# ifdef HAVE_UINT64_T
+    typedef uint64_t u_int64_t;
+# endif
+#endif
+
+#define SZF PRI_SIZE_PREFIX // defined in ruby.h
+
+#if   SIZEOF_LONG==8
+# define NUM2INT64(x) NUM2LONG(x)
+# define INT642NUM(x) LONG2NUM(x)
+# define NUM2UINT64(x) NUM2ULONG(x)
+# define UINT642NUM(x) ULONG2NUM(x)
+# ifndef PRId64
+#  define PRId64 "ld"
+# endif
+# ifndef PRIu64
+#  define PRIu64 "lu"
+# endif
+#elif SIZEOF_LONG_LONG==8
+# define NUM2INT64(x) NUM2LL(x)
+# define INT642NUM(x) LL2NUM(x)
+# define NUM2UINT64(x) NUM2ULL(x)
+# define UINT642NUM(x) ULL2NUM(x)
+# ifndef PRId64
+#  define PRId64 "lld"
+# endif
+# ifndef PRIu64
+#  define PRIu64 "llu"
+# endif
+#endif
+
+#if   SIZEOF_LONG==4
+# define NUM2INT32(x) NUM2LONG(x)
+# define INT322NUM(x) LONG2NUM(x)
+# define NUM2UINT32(x) NUM2ULONG(x)
+# define UINT322NUM(x) ULONG2NUM(x)
+# ifndef PRId32
+#  define PRId32 "ld"
+# endif
+# ifndef PRIu32
+#  define PRIu32 "lu"
+# endif
+#elif SIZEOF_INT==4
+# define NUM2INT32(x) NUM2INT(x)
+# define INT322NUM(x) INT2NUM(x)
+# define NUM2UINT32(x) NUM2UINT(x)
+# define UINT322NUM(x) UINT2NUM(x)
+# ifndef PRId32
+#  define PRId32 "d"
+# endif
+# ifndef PRIu32
+#  define PRIu32 "u"
+# endif
+#endif
+
+#ifndef HAVE_TYPE_BOOL
+  typedef int bool;
+#endif
+#ifndef FALSE                   /* in case these macros already exist */
+# define FALSE   0              /* values of bool */
+#endif
+#ifndef TRUE
+# define TRUE    1
+#endif
+
+typedef struct { float dat[2]; }  scomplex;
+typedef struct { double dat[2]; } dcomplex;
+typedef int fortran_integer;
+
+#define REAL(x) ((x).dat[0])
+#define IMAG(x) ((x).dat[1])
+
+extern int na_debug_flag;
+
+#ifndef NARRAY_C
+extern VALUE numo_cNArray;
+extern VALUE rb_mNumo;
+extern VALUE nary_eCastError;
+extern VALUE nary_eShapeError;
+extern VALUE nary_eOperationError;
+extern VALUE nary_eDimensionError;
+extern const rb_data_type_t na_data_type;
+
+//EXTERN const int na_sizeof[NA_NTYPES+1];
+#endif
+
+#define cNArray numo_cNArray
+#define mNumo rb_mNumo
+//#define na_upcast(x,y) numo_na_upcast(x,y)
+
+/* global variables within this module */
+extern VALUE numo_cBit;
+extern VALUE numo_cDFloat;
+extern VALUE numo_cSFloat;
+extern VALUE numo_cDComplex;
+extern VALUE numo_cSComplex;
+extern VALUE numo_cInt64;
+extern VALUE numo_cInt32;
+extern VALUE numo_cInt16;
+extern VALUE numo_cInt8;
+extern VALUE numo_cUInt64;
+extern VALUE numo_cUInt32;
+extern VALUE numo_cUInt16;
+extern VALUE numo_cUInt8;
+extern VALUE numo_cRObject;
+extern VALUE na_cStep;
+#ifndef HAVE_RB_CCOMPLEX
+extern VALUE rb_cComplex;
+#endif
+
+extern VALUE sym_reduce;
+extern VALUE sym_option;
+extern VALUE sym_loop_opt;
+extern VALUE sym_init;
+
+#define NARRAY_DATA_T     0x1
+#define NARRAY_VIEW_T     0x2
+#define NARRAY_FILEMAP_T  0x3
+
+typedef struct RNArray {
+    unsigned char ndim;     // # of dimensions
+    unsigned char type;     // NARRAY_DATA_T, NARRAY_VIEW_T or NARRAY_FILEMAP_T; checked by _na_get_narray_t
+    unsigned char flag[2];  // flag[0] holds NA_FL0_* bits, flag[1] holds NA_FL1_* bits
+    unsigned short elmsz;    // element size (NA_MAX_ELMSZ is 65535, the unsigned short limit)
+    size_t   size;          // # of total elements
+    size_t  *shape;         // # of elements for each dimension
+    VALUE    reduce;        // read through NA_REDUCE / RNARRAY_REDUCE
+} narray_t;
+
+
+typedef struct RNArrayData {
+    narray_t base;
+    char    *ptr;
+} narray_data_t;
+
+
+typedef union {
+    ssize_t stride;
+    size_t *index;
+} stridx_t;
+
+typedef struct RNArrayView {
+    narray_t base;
+    VALUE    data;       // data object
+    size_t   offset;     // offset of start point from data pointer
+                         // :in units of elm.unit_bits
+                         // address_unit  pointer_unit access_unit data_unit
+                         // elm.step_unit = elm.bit_size / elm.access_unit
+                         // elm.step_unit = elm.size_bits / elm.unit_bits
+    stridx_t *stridx;    // stride or indices of data pointer for each dimension
+} narray_view_t;
+
+
+// filemap is unimplemented
+typedef struct RNArrayFileMap {
+    narray_t base;
+    char    *ptr;
+#ifdef WIN32
+    HANDLE hFile;
+    HANDLE hMap;
+#else // POSIX mmap
+    int prot;
+    int flag;
+#endif
+} narray_filemap_t;
+
+
+// this will be revised in future.
+typedef struct {
+    unsigned int element_bits;
+    unsigned int element_bytes;
+    unsigned int element_stride;
+} narray_type_info_t;
+
+
+/* Return the narray_t wrapped by obj once its typed-data type is checked. */
+static inline narray_t *
+na_get_narray_t(VALUE obj)
+{
+    /* Check_TypedStruct runs before the data pointer is touched. */
+    Check_TypedStruct(obj,&na_data_type);
+
+    return (narray_t*)DATA_PTR(obj);
+}
+
+/* Typed variant of na_get_narray_t: calls rb_bug() unless the stored type is na_type. */
+static inline narray_t *
+_na_get_narray_t(VALUE obj, unsigned char na_type)
+{
+    narray_t *nary;
+
+    Check_TypedStruct(obj,&na_data_type);
+    nary = (narray_t*)DATA_PTR(obj);
+    if (nary->type == na_type) return nary;
+    rb_bug("unknown type 0x%x (0x%x given)", na_type, nary->type);
+    return nary;  /* reached only if rb_bug() returns */
+}
+
+#define na_get_narray_data_t(obj) (narray_data_t*)_na_get_narray_t(obj,NARRAY_DATA_T)
+#define na_get_narray_view_t(obj) (narray_view_t*)_na_get_narray_t(obj,NARRAY_VIEW_T)
+#define na_get_narray_filemap_t(obj) (narray_filemap_t*)_na_get_narray_t(obj,NARRAY_FILEMAP_T)
+
+#define GetNArray(obj,var)      TypedData_Get_Struct(obj, narray_t, &na_data_type, var)
+#define GetNArrayView(obj,var)  TypedData_Get_Struct(obj, narray_view_t, &na_data_type, var)
+#define GetNArrayData(obj,var)  TypedData_Get_Struct(obj, narray_data_t, &na_data_type, var)
+
+#define SDX_IS_STRIDE(x) ((x).stride&0x1)
+#define SDX_IS_INDEX(x)  (!SDX_IS_STRIDE(x))
+#define SDX_GET_STRIDE(x) ((x).stride>>1)
+#define SDX_GET_INDEX(x)  ((x).index)
+
+#define SDX_SET_STRIDE(x,s) ((x).stride=((s)<<1)|0x1)
+#define SDX_SET_INDEX(x,idx) ((x).index=idx)
+
+#define RNARRAY(val)            ((narray_t*)DATA_PTR(val))
+#define RNARRAY_DATA(val)       ((narray_data_t*)DATA_PTR(val))
+#define RNARRAY_VIEW(val)       ((narray_view_t*)DATA_PTR(val))
+#define RNARRAY_FILEMAP(val)    ((narray_filemap_t*)DATA_PTR(val))
+
+#define RNARRAY_NDIM(val)       (RNARRAY(val)->ndim)
+#define RNARRAY_TYPE(val)       (RNARRAY(val)->type)
+#define RNARRAY_FLAG(val)       (RNARRAY(val)->flag)
+#define RNARRAY_SIZE(val)       (RNARRAY(val)->size)
+#define RNARRAY_SHAPE(val)      (RNARRAY(val)->shape)
+#define RNARRAY_REDUCE(val)     (RNARRAY(val)->reduce)
+
+#define RNARRAY_DATA_PTR(val)    (RNARRAY_DATA(val)->ptr)
+#define RNARRAY_VIEW_DATA(val)   (RNARRAY_VIEW(val)->data)
+#define RNARRAY_VIEW_OFFSET(val) (RNARRAY_VIEW(val)->offset)
+#define RNARRAY_VIEW_STRIDX(val) (RNARRAY_VIEW(val)->stridx)
+
+#define NA_NDIM(na)     (((narray_t*)(na))->ndim)    /* (na) parenthesized so the cast covers the whole argument */
+#define NA_TYPE(na)     (((narray_t*)(na))->type)
+#define NA_SIZE(na)     (((narray_t*)(na))->size)
+#define NA_SHAPE(na)    (((narray_t*)(na))->shape)
+#define NA_REDUCE(na)   (((narray_t*)(na))->reduce)
+
+#define NA_FLAG(obj)    (na_get_narray_t(obj)->flag)
+#define NA_FLAG0(obj)   (NA_FLAG(obj)[0])
+#define NA_FLAG1(obj)   (NA_FLAG(obj)[1])
+
+#define NA_DATA(na)             ((narray_data_t*)(na))
+#define NA_VIEW(na)             ((narray_view_t*)(na))
+#define NA_DATA_PTR(na)         (NA_DATA(na)->ptr)
+#define NA_VIEW_DATA(na)        (NA_VIEW(na)->data)
+#define NA_VIEW_OFFSET(na)      (NA_VIEW(na)->offset)
+#define NA_VIEW_STRIDX(na)      (NA_VIEW(na)->stridx)
+
+#define NA_IS_INDEX_AT(na,i)    (SDX_IS_INDEX(NA_VIEW_STRIDX(na)[i]))
+#define NA_IS_STRIDE_AT(na,i)   (SDX_IS_STRIDE(NA_VIEW_STRIDX(na)[i]))
+#define NA_INDEX_AT(na,i)       (SDX_GET_INDEX(NA_VIEW_STRIDX(na)[i]))
+#define NA_STRIDE_AT(na,i)      (SDX_GET_STRIDE(NA_VIEW_STRIDX(na)[i]))
+
+#define NA_FILEMAP_PTR(na)      (((narray_filemap_t*)(na))->ptr)  /* (na) parenthesized so the cast covers the whole argument */
+
+
+#define NA_FL0_TEST(x,f) (NA_FLAG0(x)&(f))
+#define NA_FL1_TEST(x,f) (NA_FLAG1(x)&(f))
+
+#define NA_FL0_SET(x,f) do {NA_FLAG0(x) |= (f);} while(0)
+#define NA_FL1_SET(x,f) do {NA_FLAG1(x) |= (f);} while(0)
+
+#define NA_FL0_UNSET(x,f) do {NA_FLAG0(x) &= ~(f);} while(0)
+#define NA_FL1_UNSET(x,f) do {NA_FLAG1(x) &= ~(f);} while(0)
+
+#define NA_FL0_REVERSE(x,f) do {NA_FLAG0(x) ^= (f);} while(0)
+#define NA_FL1_REVERSE(x,f) do {NA_FLAG1(x) ^= (f);} while(0)
+
+
+/* FLAGS
+   - row-major / column-major
+   - Overwrite or not
+   - byteswap
+   - Extensible?
+   - matrix or not
+*/
+
+#define NA_FL0_BIG_ENDIAN     (0x1<<0)
+#define NA_FL0_COLUMN_MAJOR   (0x1<<1)
+#define NA_FL1_LOCK           (0x1<<0)
+#define NA_FL1_INPLACE        (0x1<<1)
+
+#define TEST_COLUMN_MAJOR(x)   NA_FL0_TEST(x,NA_FL0_COLUMN_MAJOR)
+#define SET_COLUMN_MAJOR(x)    NA_FL0_SET(x,NA_FL0_COLUMN_MAJOR)
+#define UNSET_COLUMN_MAJOR(x)  NA_FL0_UNSET(x,NA_FL0_COLUMN_MAJOR)
+
+#define TEST_ROW_MAJOR(x)      (!TEST_COLUMN_MAJOR(x))
+#define SET_ROW_MAJOR(x)       UNSET_COLUMN_MAJOR(x)
+#define UNSET_ROW_MAJOR(x)     SET_COLUMN_MAJOR(x)
+
+#define TEST_BIG_ENDIAN(x)     NA_FL0_TEST(x,NA_FL0_BIG_ENDIAN)
+#define SET_BIG_ENDIAN(x)      NA_FL0_SET(x,NA_FL0_BIG_ENDIAN)
+#define UNSET_BIG_ENDIAN(x)    NA_FL0_UNSET(x,NA_FL0_BIG_ENDIAN)
+
+#define TEST_LITTLE_ENDIAN(x)  (!TEST_BIG_ENDIAN(x))
+#define SET_LITTLE_ENDIAN(x)   UNSET_BIG_ENDIAN(x)
+#define UNSET_LITTLE_ENDIAN(x) SET_BIG_ENDIAN(x)
+
+#define REVERSE_ENDIAN(x)      NA_FL0_REVERSE((x),NA_FL0_BIG_ENDIAN)
+
+#define TEST_LOCK(x)           NA_FL1_TEST(x,NA_FL1_LOCK)
+#define SET_LOCK(x)            NA_FL1_SET(x,NA_FL1_LOCK)
+#define UNSET_LOCK(x)          NA_FL1_UNSET(x,NA_FL1_LOCK)
+
+#define TEST_INPLACE(x)        NA_FL1_TEST(x,NA_FL1_INPLACE)
+#define SET_INPLACE(x)         NA_FL1_SET(x,NA_FL1_INPLACE)
+#define UNSET_INPLACE(x)       NA_FL1_UNSET(x,NA_FL1_INPLACE)
+
+#ifdef DYNAMIC_ENDIAN
+// not supported
+#else
+#ifdef WORDS_BIGENDIAN
+#define TEST_HOST_ORDER(x)     TEST_BIG_ENDIAN(x)
+#define SET_HOST_ORDER(x)      SET_BIG_ENDIAN(x)
+#define UNSET_HOST_ORDER(x)    UNSET_BIG_ENDIAN(x)
+#define TEST_BYTE_SWAPPED(x)   TEST_LITTLE_ENDIAN(x)
+#define SET_BYTE_SWAPPED(x)    SET_LITTLE_ENDIAN(x)
+#define UNSET_BYTE_SWAPPED(x)  UNSET_LITTLE_ENDIAN(x)
+#define NA_FL0_INIT            NA_FL0_BIG_ENDIAN
+#else // LITTLE ENDIAN
+#define TEST_HOST_ORDER(x)     TEST_LITTLE_ENDIAN(x)
+#define SET_HOST_ORDER(x)      SET_LITTLE_ENDIAN(x)
+#define UNSET_HOST_ORDER(x)    UNSET_LITTLE_ENDIAN(x)
+#define TEST_BYTE_SWAPPED(x)   TEST_BIG_ENDIAN(x)
+#define SET_BYTE_SWAPPED(x)    SET_BIG_ENDIAN(x)
+#define UNSET_BYTE_SWAPPED(x)  UNSET_BIG_ENDIAN(x)
+#define NA_FL0_INIT            0
+#endif
+#endif
+#define NA_FL1_INIT            0
+
+
+#define IsNArray(obj) (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+
+#define DEBUG_PRINT(v) puts(StringValueCStr(rb_funcall(v,rb_intern("inspect"),0)))
+
+#define NA_IsNArray(obj) \
+  (rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+#define NA_IsArray(obj) \
+  (TYPE(obj)==T_ARRAY || rb_obj_is_kind_of(obj,cNArray)==Qtrue)
+
+#define NUM2REAL(v)  NUM2DBL( rb_funcall((v),na_id_real,0) )
+#define NUM2IMAG(v)  NUM2DBL( rb_funcall((v),na_id_imag,0) )
+
+#define NA_MAX_DIMENSION (int)(sizeof(VALUE)*8-2)
+#define NA_MAX_ELMSZ     65535
+
+typedef unsigned int BIT_DIGIT;
+//#define BYTE_BIT_DIGIT sizeof(BIT_DIGIT)
+#define NB     (sizeof(BIT_DIGIT)*8)
+#define BALL   (~(BIT_DIGIT)0)
+#define SLB(n) (((n)==NB)?~(BIT_DIGIT)0:(~(~(BIT_DIGIT)0<<(n))))
+
+#define ELEMENT_BIT_SIZE  "ELEMENT_BIT_SIZE"
+#define ELEMENT_BYTE_SIZE "ELEMENT_BYTE_SIZE"
+#define CONTIGUOUS_STRIDE "CONTIGUOUS_STRIDE"
+
+
+#ifdef RUBY_INTEGER_UNIFICATION
+#define IS_INTEGER_CLASS(c) ((c)==rb_cInteger)
+#else
+#define IS_INTEGER_CLASS(c) ((c)==rb_cFixnum||(c)==rb_cBignum)
+#endif
+
+#include "numo/ndloop.h"
+#include "numo/intern.h"
+
+#if defined(__cplusplus)
+#if 0
+{ /* satisfy cc-mode */
+#endif
+}  /* extern "C" { */
+#endif
+
+#endif /* ifndef NARRAY_H */
diff --git a/ext/numo/narray/numo/ndloop.h b/ext/numo/narray/numo/ndloop.h
new file mode 100644
index 0000000..0b387b4
--- /dev/null
+++ b/ext/numo/narray/numo/ndloop.h
@@ -0,0 +1,94 @@
+/*
+  ndloop.h
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#ifndef NDLOOP_H
+#define NDLOOP_H
+
+typedef struct NA_LOOP_ITER {
+    ssize_t    pos; // - required for each dimension.
+    ssize_t    step;
+    size_t    *idx;
+} na_loop_iter_t;
+
+typedef struct NA_LOOP_ARGS {
+    VALUE    value;
+    ssize_t  elmsz;
+    char    *ptr;
+    //char    *buf_ptr;  //
+    int      ndim;       // required for each argument.
+    // ssize_t pos; - not required here.
+    size_t  *shape;
+    na_loop_iter_t *iter;  // moved from na_loop_t
+} na_loop_args_t;
+
+// pass this structure to user iterator
+typedef struct NA_LOOP {
+    int  narg;             // presumably the number of entries in args -- TODO confirm
+    int  ndim;             // number of user dimensions - required for each iterator.
+    size_t *n;             // number of elements for each dim (=shape); n[0] is read by NDL_CNT
+    na_loop_args_t *args;  // one na_loop_args_t per argument, indexed as args[i]
+    VALUE  option;
+    void  *opt_ptr;
+    VALUE  err_type;
+} na_loop_t;
+
+
+// ------------------ ndfunc -------------------------------------------
+
+#define NDF_HAS_LOOP            (1<<0) // x[i]
+#define NDF_STRIDE_LOOP         (1<<1) // *(x+stride*i)
+#define NDF_INDEX_LOOP          (1<<2) // *(x+idx[i])
+#define NDF_KEEP_DIM            (1<<3)
+#define NDF_INPLACE             (1<<4)
+#define NDF_ACCEPT_BYTESWAP     (1<<5)
+
+#define NDF_FLAT_REDUCE         (1<<6)
+#define NDF_EXTRACT             (1<<7)
+#define NDF_CUM                 (1<<8)
+
+#define FULL_LOOP       (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP|NDF_INPLACE)
+#define FULL_LOOP_NIP   (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP)
+#define STRIDE_LOOP     (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INPLACE)
+#define STRIDE_LOOP_NIP (NDF_HAS_LOOP|NDF_STRIDE_LOOP)
+#define NO_LOOP         0
+
+#define OVERWRITE Qtrue // used for CASTABLE(t)
+
+#define NDF_TEST(nf,fl)  ((nf)->flag & (fl))
+#define NDF_SET(nf,fl)  {(nf)->flag |= (fl);}
+
+#define NDF_ARG_READ_ONLY   1
+#define NDF_ARG_WRITE_ONLY  2
+#define NDF_ARG_READ_WRITE  3
+
+// type of user function
+typedef void (*na_iter_func_t) _((na_loop_t *const));
+typedef VALUE (*na_text_func_t) _((char *ptr, size_t pos, VALUE opt));
+//typedef void (*) void (*loop_func)(ndfunc_t*, na_md_loop_t*))
+
+
+typedef struct NDF_ARG_IN {
+    VALUE   type;    // argument types
+    int     dim;     // # of dimension of argument handled by user function
+                     // if dim==-1, reduce dimension
+} ndfunc_arg_in_t;
+
+typedef struct NDF_ARG_OUT {
+    VALUE   type;    // argument types
+    int     dim;     // # of dimension of argument handled by user function
+    size_t *shape;
+} ndfunc_arg_out_t;
+
+// spec of user function
+typedef struct NDFUNCTION {
+    na_iter_func_t func;    // user function
+    unsigned int flag;      // what kind of loop user function supports
+    int nin;                // # of arguments
+    int nout;               // # of results
+    ndfunc_arg_in_t *ain;   // spec of input arguments
+    ndfunc_arg_out_t *aout; // spec of output result
+} ndfunc_t;
+
+#endif /* NDLOOP_H */
diff --git a/ext/numo/narray/numo/template.h b/ext/numo/narray/numo/template.h
new file mode 100644
index 0000000..2fd6e0e
--- /dev/null
+++ b/ext/numo/narray/numo/template.h
@@ -0,0 +1,136 @@
+/*
+  template.h
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#ifndef TEMPLATE_H
+#define TEMPLATE_H
+
+#define INIT_COUNTER( lp, c )                   \
+    {   c = (lp)->n[0]; }
+
+#define NDL_CNT(lp) ((lp)->n[0])
+#define NDL_PTR(lp,i) ((lp)->args[i].ptr + (lp)->args[i].iter[0].pos)
+#define NDL_STEP(lp,i) ((lp)->args[i].iter[0].step)
+#define NDL_IDX(lp,i) ((lp)->args[i].iter[0].idx)
+#define NDL_ESZ(lp,i) ((lp)->args[i].elmsz)
+#define NDL_SHAPE(lp,i) ((lp)->args[i].shape)
+
+#define INIT_PTR( lp, i, pt, st )                               \
+    {                                                           \
+        pt = ((lp)->args[i]).ptr + ((lp)->args[i].iter[0]).pos;         \
+        st = ((lp)->args[i].iter[0]).step;                              \
+    }
+
+#define INIT_PTR_IDX( lp, i, pt, st, id )                       \
+    {                                                           \
+        pt = ((lp)->args[i]).ptr + ((lp)->args[i].iter[0]).pos;         \
+        st = ((lp)->args[i].iter[0]).step;                              \
+        id = ((lp)->args[i].iter[0]).idx;                               \
+    }
+
+#define INIT_ELMSIZE( lp, i, es )                               \
+    {                                                           \
+        es = ((lp)->args[i]).elmsz;                             \
+    }
+
+#define INIT_PTR_BIT( lp, i, ad, ps, st )               \
+    {                                                   \
+        ps = ((lp)->args[i].iter[0]).pos;                       \
+        ad = (BIT_DIGIT*)(((lp)->args[i]).ptr) + ps/NB; \
+        ps %= NB;                                       \
+        st = ((lp)->args[i].iter[0]).step;                      \
+    }
+
+#define INIT_PTR_BIT_IDX( lp, i, ad, ps, st, id )       \
+    {                                                   \
+        ps = ((lp)->args[i].iter[0]).pos;                       \
+        ad = (BIT_DIGIT*)(((lp)->args[i]).ptr) + ps/NB; \
+        ps %= NB;                                       \
+        st = ((lp)->args[i].iter[0]).step;                      \
+        id = ((lp)->args[i].iter[0]).idx;                       \
+    }
+
+#define GET_DATA( ptr, type, val )                 \
+    {                                              \
+        val = *(type*)(ptr);                       \
+    }
+
+#define SET_DATA( ptr, type, val )                 \
+    {                                              \
+        *(type*)(ptr) = val;                       \
+    }
+
+#define GET_DATA_STRIDE( ptr, step, type, val )    \
+    {                                              \
+        val = *(type*)(ptr);                       \
+        ptr += step;                               \
+    }
+
+#define GET_DATA_INDEX( ptr, idx, type, val )     \
+    {                                           \
+        val = *(type*)(ptr + *idx);             \
+        idx++;                                  \
+    }
+
+#define SET_DATA_STRIDE( ptr, step, type, val ) \
+    {                                           \
+        *(type*)(ptr) = val;                    \
+        ptr += step;                            \
+    }
+
+#define SET_DATA_INDEX( ptr, idx, type, val )   \
+    {                                           \
+        *(type*)(ptr + *idx) = val;             \
+        idx++;                                  \
+    }
+
+#define LOAD_BIT( adr, pos, val )                       \
+    {                                                   \
+        size_t dig = (pos) / NB;                        \
+        int    bit = (pos) % NB;                        \
+        val = (((BIT_DIGIT*)(adr))[dig]>>(bit)) & 1u;   \
+    }
+
+#define LOAD_BIT_STEP( adr, pos, step, idx, val )       \
+    {                                                   \
+        size_t dig; int bit;                            \
+        if (idx) {                                      \
+            dig = ((pos) + *(idx)) / NB;                \
+            bit = ((pos) + *(idx)) % NB;                \
+            idx++;                                      \
+        } else {                                        \
+            dig = (pos) / NB;                           \
+            bit = (pos) % NB;                           \
+            pos += step;                                \
+        }                                               \
+        val = (((BIT_DIGIT*)(adr))[dig]>>bit) & 1u;     \
+    }
+
+#define STORE_BIT(adr,pos,val)                  \
+    {                                           \
+        size_t dig = (pos) / NB;                \
+        int    bit = (pos) % NB;                \
+        ((BIT_DIGIT*)(adr))[dig] =              \
+            (((BIT_DIGIT*)(adr))[dig] & ~(1u<<(bit))) | ((val)<<(bit)); \
+    }
+// val -> val&1 ??
+
+#define STORE_BIT_STEP( adr, pos, step, idx, val )\
+    {                                           \
+        size_t dig; int bit;                    \
+        if (idx) {                              \
+            dig = ((pos) + *(idx)) / NB;        \
+            bit = ((pos) + *(idx)) % NB;        \
+            idx++;                              \
+        } else {                                \
+            dig = (pos) / NB;                   \
+            bit = (pos) % NB;                   \
+            pos += step;                        \
+        }                                       \
+        ((BIT_DIGIT*)(adr))[dig] =              \
+            (((BIT_DIGIT*)(adr))[dig] & ~(1u<<(bit))) | ((val)<<(bit)); \
+    }
+// val -> val&1 ??
+
+#endif /* ifndef TEMPLATE_H */
diff --git a/ext/numo/narray/numo/types/bit.h b/ext/numo/narray/numo/types/bit.h
new file mode 100644
index 0000000..1cf3e94
--- /dev/null
+++ b/ext/numo/narray/numo/types/bit.h
@@ -0,0 +1,33 @@
+typedef BIT_DIGIT dtype;
+typedef BIT_DIGIT rtype;
+#define cT  numo_cBit
+#define cRT cT
+
+#define m_zero 0
+#define m_one  1
+
+#define m_abs(x)     (x)
+#define m_sign(x)    (((x)==0) ? 0:1)
+
+#define m_from_double(x) (((x)==0) ? 0 : 1)
+#define m_from_real(x) (((x)==0) ? 0 : 1)
+#define m_data_to_num(x) INT2FIX(x)
+#define m_sprintf(s,x)   sprintf(s,"%1d",(int)(x))
+
+#define m_copy(x)  (x)
+#define m_not(x)   (~(x))
+#define m_and(x,y) ((x)&(y))
+#define m_or(x,y)  ((x)|(y))
+#define m_xor(x,y) ((x)^(y))
+#define m_eq(x,y)  (~((x)^(y)))
+#define m_count_true(x)  ((x)!=0)
+#define m_count_false(x) ((x)==0)
+
+/* Convert a Ruby value to a bit: 0 when it is falsy or rb_equal to 0, else 1. */
+static inline BIT_DIGIT m_num_to_data(VALUE num) {
+    /* falsy values never reach the equality call */
+    if (!RTEST(num)) return 0;
+    /* compare against Fixnum zero */
+    if (RTEST(rb_equal(num,INT2FIX(0)))) return 0;
+    return 1;
+}
diff --git a/ext/numo/narray/numo/types/complex.h b/ext/numo/narray/numo/types/complex.h
new file mode 100644
index 0000000..9e476fe
--- /dev/null
+++ b/ext/numo/narray/numo/types/complex.h
@@ -0,0 +1,409 @@
+/*
+  complex.h
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+
+
+static inline dtype c_new(rtype r, rtype i) {
+    dtype z;
+    REAL(z) = r;
+    IMAG(z) = i;
+    return z;
+}
+
+static inline dtype c_set_real(dtype x, rtype r) {
+    REAL(x)=r;
+    return x;
+}
+
+static inline dtype c_set_imag(dtype x, rtype i) {
+    IMAG(x)=i;
+    return x;
+}
+
+static inline VALUE COMP2NUM(dtype x) {
+    /* Build a Ruby Complex via Kernel#Complex. The receiver must be the module
+       object rb_mKernel; rb_intern("Kernel") yields an ID, not a VALUE. */
+    return rb_funcall(rb_mKernel, rb_intern("Complex"), 2,
+                      rb_float_new(REAL(x)), rb_float_new(IMAG(x)));
+}
+
+static inline dtype NUM2COMP(VALUE v) {
+    dtype z;
+    REAL(z) = NUM2DBL(rb_funcall(v,id_real,0));
+    IMAG(z) = NUM2DBL(rb_funcall(v,id_imag,0));
+    return z;
+}
+
+#define c_is_zero(x) (REAL(x)==0 && IMAG(x)==0)
+#define c_eq(x,y) (REAL(x)==REAL(y) && IMAG(x)==IMAG(y))
+#define c_ne(x,y) (REAL(x)!=REAL(y) || IMAG(x)!=IMAG(y))
+#define c_isnan(x) (isnan(REAL(x)) || isnan(IMAG(x)))
+#define c_isinf(x) (isinf(REAL(x)) || isinf(IMAG(x)))
+#define c_isposinf(x) ((isinf(REAL(x)) && signbit(REAL(x))==0) || \
+                       (isinf(IMAG(x)) && signbit(IMAG(x))==0))
+#define c_isneginf(x) ((isinf(REAL(x)) && signbit(REAL(x))) || \
+                       (isinf(IMAG(x)) && signbit(IMAG(x))))
+#define c_isfinite(x) (isfinite(REAL(x)) && isfinite(IMAG(x)))
+
+static inline dtype c_zero() {
+    dtype z;
+    REAL(z) = 0;
+    IMAG(z) = 0;
+    return z;
+}
+
+static inline dtype c_one() {
+    dtype z;
+    REAL(z) = 1;
+    IMAG(z) = 0;
+    return z;
+}
+
+static inline dtype c_minus(dtype x) {
+    dtype z;
+    REAL(z) = -REAL(x);
+    IMAG(z) = -IMAG(x);
+    return z;
+}
+
+static inline dtype c_im(dtype x) {
+    dtype z;
+    REAL(z) = -IMAG(x);
+    IMAG(z) = REAL(x);
+    return z;
+}
+
+static inline dtype c_add(dtype x, dtype y) {
+    dtype z;
+    REAL(z) = REAL(x)+REAL(y);
+    IMAG(z) = IMAG(x)+IMAG(y);
+    return z;
+}
+
+static inline dtype c_sub(dtype x, dtype y) {
+    dtype z;
+    REAL(z) = REAL(x)-REAL(y);
+    IMAG(z) = IMAG(x)-IMAG(y);
+    return z;
+}
+
+
+static inline dtype c_mul(dtype x, dtype y) {
+    dtype z;
+    REAL(z) = REAL(x)*REAL(y)-IMAG(x)*IMAG(y);
+    IMAG(z) = REAL(x)*IMAG(y)+IMAG(x)*REAL(y);
+    return z;
+}
+
+static inline dtype c_mul_r(dtype x, rtype y) {
+    dtype z;
+    REAL(z) = REAL(x)*y;
+    IMAG(z) = IMAG(x)*y;
+    return z;
+}
+
+static inline dtype c_div(dtype x, dtype y) {
+    dtype z;
+    rtype s,yr,yi;
+    s  = r_hypot(REAL(y),IMAG(y));
+    yr = REAL(y)/s;
+    yi = IMAG(y)/s;
+    REAL(z) = (REAL(x)*yr+IMAG(x)*yi)/s;
+    IMAG(z) = (IMAG(x)*yr-REAL(x)*yi)/s;
+    return z;
+}
+
+static inline dtype c_div_r(dtype x, rtype y) {
+    dtype z;
+    REAL(z) = REAL(x)/y;
+    IMAG(z) = IMAG(x)/y;
+    return z;
+}
+
+static inline dtype c_reciprocal(dtype x) {
+    dtype z;
+    if ( r_abs(REAL(x)) > r_abs(IMAG(x)) ) {
+        IMAG(z) = IMAG(x)/REAL(x);
+        REAL(z) = (1+IMAG(z)*IMAG(z))*REAL(x);
+        IMAG(z) /= -REAL(z);
+        REAL(z) = 1/REAL(z);
+    } else {
+        REAL(z) = REAL(x)/IMAG(x);
+        IMAG(z) = (1+REAL(z)*REAL(z))*IMAG(x);
+        REAL(z) /= IMAG(z);
+        IMAG(z) = -1/IMAG(z);
+    }
+    return z;
+}
+
+static inline dtype c_square(dtype x) {
+    dtype z;
+    REAL(z) = REAL(x)*REAL(x)-IMAG(x)*IMAG(x);
+    IMAG(z) = 2*REAL(x)*IMAG(x);
+    return z;
+}
+
+static inline dtype c_sqrt(dtype x) {
+    dtype z;
+    rtype xr, xi, r;
+    xr = REAL(x)/2;
+    xi = IMAG(x)/2;
+    r  = r_hypot(xr,xi);
+    if (xr>0) {
+        REAL(z) = sqrt(r+xr);
+        IMAG(z) = xi/REAL(z);
+    } else if ( (r-=xr)!=0 ) {
+        IMAG(z) = (xi>=0) ? sqrt(r):-sqrt(r);
+        REAL(z) = xi/IMAG(z);
+    } else {
+        REAL(z) = IMAG(z) = 0;
+    }
+    return z;
+}
+
+static inline dtype c_log(dtype x) {
+    dtype z;
+    REAL(z) = r_log(r_hypot(REAL(x),IMAG(x)));
+    IMAG(z) = r_atan2(IMAG(x),REAL(x));
+    return z;
+}
+
+static inline dtype c_log2(dtype x) {
+    /* log2(x) = ln(x) * log2(e): scale the logarithm z, not the argument x. */
+    dtype z = c_log(x);
+    z = c_mul_r(z,M_LOG2E);
+    return z;
+}
+
+static inline dtype c_log10(dtype x) {
+    /* log10(x) = ln(x) * log10(e): scale the logarithm z, not the argument x. */
+    dtype z = c_log(x);
+    z = c_mul_r(z,M_LOG10E);
+    return z;
+}
+
+static inline dtype c_exp(dtype x) {
+    dtype z;
+    rtype a = r_exp(REAL(x));
+    REAL(z) = a*r_cos(IMAG(x));
+    IMAG(z) = a*r_sin(IMAG(x));
+    return z;
+}
+
+static inline dtype c_exp2(dtype x) {
+    dtype z;                          /* 2**x == exp(x*ln 2) */
+    rtype a = r_exp(REAL(x)*M_LN2);
+    REAL(z) = a*r_cos(IMAG(x)*M_LN2); /* the phase is scaled by ln 2 as well */
+    IMAG(z) = a*r_sin(IMAG(x)*M_LN2);
+    return z;
+}
+
+static inline dtype c_exp10(dtype x) {
+    dtype z;                           /* 10**x == exp(x*ln 10) */
+    rtype a = r_exp(REAL(x)*M_LN10);
+    REAL(z) = a*r_cos(IMAG(x)*M_LN10); /* the phase is scaled by ln 10 as well */
+    IMAG(z) = a*r_sin(IMAG(x)*M_LN10);
+    return z;
+}
+
+static inline dtype c_sin(dtype x) {
+    dtype z;
+    REAL(z) = r_sin(REAL(x))*r_cosh(IMAG(x));
+    IMAG(z) = r_cos(REAL(x))*r_sinh(IMAG(x));
+    return z;
+}
+
+static inline dtype c_sinh(dtype x) {
+    dtype z;
+    REAL(z) = r_sinh(REAL(x))*r_cos(IMAG(x));
+    IMAG(z) = r_cosh(REAL(x))*r_sin(IMAG(x));
+    return z;
+}
+
+static inline dtype c_cos(dtype x) {
+    dtype z;
+    REAL(z) = r_cos(REAL(x))*r_cosh(IMAG(x));
+    IMAG(z) = -r_sin(REAL(x))*r_sinh(IMAG(x));
+    return z;
+}
+
+static inline dtype c_cosh(dtype x) {
+    dtype z;
+    REAL(z) = r_cosh(REAL(x))*r_cos(IMAG(x));
+    IMAG(z) = r_sinh(REAL(x))*r_sin(IMAG(x));
+    return z;
+}
+
+static inline dtype c_tan(dtype x) {
+    dtype z;
+    rtype c, d;
+    if (r_abs(IMAG(x))<1) {
+        c = r_cos(REAL(x));
+        d = r_sinh(IMAG(x));
+        d = c*c + d*d;
+        REAL(z) = 0.5*r_sin(2*REAL(x))/d;
+        IMAG(z) = 0.5*r_sinh(2*IMAG(x))/d;
+    } else {
+        d = r_exp(-IMAG(x));
+        c = 2*d/(1-d*d);
+        c = c*c;
+        d = r_cos(REAL(x));
+        d = 1.0 + d*d*c;
+        REAL(z) = 0.5*r_sin(2*REAL(x))*c/d;
+        IMAG(z) = 1/r_tanh(IMAG(x))/d;
+    }
+    return z;
+}
+
+static inline dtype c_tanh(dtype x) {
+    dtype z;
+    rtype c, d, s;
+    c = r_cos(IMAG(x));
+    s = r_sinh(REAL(x));
+    d = c*c + s*s;
+    if (r_abs(REAL(x))<1) {
+        REAL(z) = s*r_cosh(REAL(x))/d;
+        IMAG(z) = 0.5*r_sin(2*IMAG(x))/d;
+    } else {
+        c = c / s;
+        c = 1 + c*c;
+        REAL(z) = 1/(r_tanh(REAL(x))*c);
+        IMAG(z) = 0.5*r_sin(2*IMAG(x))/d;
+    }
+    return z;
+}
+
+static inline dtype c_asin(dtype x) {
+    dtype z, y;
+    y = c_square(x);
+    REAL(y) = 1-REAL(y);
+    IMAG(y) = -IMAG(y);
+    y = c_sqrt(y);
+    REAL(y) -= IMAG(x);
+    IMAG(y) += REAL(x);
+    y = c_log(y);
+    REAL(z) = IMAG(y);
+    IMAG(z) = -REAL(y);
+    return z;
+}
+
+static inline dtype c_asinh(dtype x) {
+    dtype z, y;
+    y = c_square(x);
+    REAL(y) += 1;
+    y = c_sqrt(y);
+    REAL(y) += REAL(x);
+    IMAG(y) += IMAG(x);
+    z = c_log(y);
+    return z;
+}
+
+static inline dtype c_acos(dtype x) {
+    dtype z, y;
+    y = c_square(x);
+    REAL(y) = 1-REAL(y);
+    IMAG(y) = -IMAG(y);
+    y = c_sqrt(y);
+    REAL(z) = REAL(x)-IMAG(y);
+    IMAG(z) = IMAG(x)+REAL(y);
+    y = c_log(z);
+    REAL(z) = IMAG(y);
+    IMAG(z) = -REAL(y);
+    return z;
+}
+
+static inline dtype c_acosh(dtype x) {
+    dtype z, y;
+    y = c_square(x);
+    REAL(y) -= 1;
+    y = c_sqrt(y);
+    REAL(y) += REAL(x);
+    IMAG(y) += IMAG(x);
+    z = c_log(y);
+    return z;
+}
+
+static inline dtype c_atan(dtype x) {
+    dtype z, y;
+    REAL(y) = -REAL(x);
+    IMAG(y) = 1-IMAG(x);
+    REAL(z) = REAL(x);
+    IMAG(z) = 1+IMAG(x);
+    y = c_div(z,y);
+    y = c_log(y);
+    REAL(z) = -IMAG(y)/2;
+    IMAG(z) = REAL(y)/2;
+    return z;
+}
+
+static inline dtype c_atanh(dtype x) {
+    dtype z, y;
+    REAL(y) = 1-REAL(x);
+    IMAG(y) = -IMAG(x);
+    REAL(z) = 1+REAL(x);
+    IMAG(z) = IMAG(x);
+    y = c_div(z,y);
+    y = c_log(y);
+    REAL(z) = REAL(y)/2;
+    IMAG(z) = IMAG(y)/2;
+    return z;
+}
+
+/* x**y for complex x and y; exp(y*log(x)) outside the two special cases. */
+static inline dtype c_pow(dtype x, dtype y)
+{
+    /* a zero exponent always gives one */
+    if (c_is_zero(y)) {
+        return c_one();
+    }
+    /* zero raised to a positive real exponent is zero */
+    if (c_is_zero(x) && REAL(y)>0 && IMAG(y)==0) {
+        return c_zero();
+    }
+    /* general case */
+    return c_exp(c_mul(y,c_log(x)));
+}
+
+static inline dtype c_pow_int(dtype x, int p)
+{
+    dtype z = c_one();
+    if (p<0) {
+	x = c_pow_int(x,-p);
+	return c_reciprocal(x);
+    }
+    if (p==2) {return c_square(x);}
+    if (p&1) {z = x;}
+    p >>= 1;
+    while (p) {
+	x = c_square(x);
+	if (p&1) z = c_mul(z,x);
+	p >>= 1;
+    }
+    return z;
+}
+
+/* Cube root of complex x, evaluated as exp(log(x)/3). */
+static inline dtype c_cbrt(dtype x) {
+    dtype third_of_log;
+
+    third_of_log = c_div_r(c_log(x),3);
+    return c_exp(third_of_log);
+}
+
+static inline rtype c_abs(dtype x) {
+    return r_hypot(REAL(x),IMAG(x));
+}
+
+static inline rtype c_abs_square(dtype x) {
+    return REAL(x)*REAL(x)+IMAG(x)*IMAG(x);
+}
+
+
+
+/*
+static inline rtype c_hypot(dtype x, dtype y) {
+    return r_hypot(c_abs(x),c_abs(y));
+}
+*/
diff --git a/ext/numo/narray/numo/types/complex_macro.h b/ext/numo/narray/numo/types/complex_macro.h
new file mode 100644
index 0000000..7adadbb
--- /dev/null
+++ b/ext/numo/narray/numo/types/complex_macro.h
@@ -0,0 +1,375 @@
+#include "float_def.h"
+
+EXTERN double round(double);
+EXTERN double log2(double);
+EXTERN double exp2(double);
+EXTERN double exp10(double);
+
+#define r_abs(x)   fabs(x)
+#define r_sqrt(x)  sqrt(x)
+#define r_exp(x)   exp(x)
+#define r_log(x)   log(x)
+#define r_sin(x)   sin(x)
+#define r_cos(x)   cos(x)
+#define r_sinh(x)  sinh(x)
+#define r_cosh(x)  cosh(x)
+#define r_tanh(x)  tanh(x)
+#define r_atan2(y,x)  atan2(y,x)
+#define r_hypot(x,y)  hypot(x,y)
+
+#include "complex.h"
+
+/* Build a dtype from an scomplex by copying its real and imaginary parts. */
+static inline dtype c_from_scomplex(scomplex x) {
+    dtype conv;
+    REAL(conv) = REAL(x); IMAG(conv) = IMAG(x);
+    return conv;
+}
+
+/* Build a dtype from a dcomplex by copying its real and imaginary parts. */
+static inline dtype c_from_dcomplex(dcomplex x) {
+    dtype conv;
+    REAL(conv) = REAL(x); IMAG(conv) = IMAG(x);
+    return conv;
+}
+
+/* --------------------------- */
+
+#define m_zero c_zero()
+#define m_one  c_one()
+
+#define m_num_to_data(x) NUM2COMP(x)
+#define m_data_to_num(x) COMP2NUM(x)
+
+#define m_from_double(x) c_new(x,0)
+#define m_from_real(x)   c_new(x,0)
+#define m_from_scomplex(x) c_from_scomplex(x)
+#define m_from_dcomplex(x) c_from_dcomplex(x)
+
+#define m_extract(x) COMP2NUM(*(dtype*)x)
+
+#define m_real(x)  REAL(x)
+#define m_imag(x)  IMAG(x)
+#define m_set_real(x,y)  c_set_real(x,y)
+#define m_set_imag(x,y)  c_set_imag(x,y)
+
+#define m_add(x,y) c_add(x,y)
+#define m_sub(x,y) c_sub(x,y)
+#define m_mul(x,y) c_mul(x,y)
+#define m_div(x,y) c_div(x,y)
+#define m_mod(x,y) c_mod(x,y)
+#define m_pow(x,y) c_pow(x,y)
+#define m_pow_int(x,y) c_pow_int(x,y)
+
+#define m_abs(x)   c_abs(x)
+#define m_minus(x) c_minus(x)
+#define m_reciprocal(x) c_reciprocal(x)
+#define m_square(x) c_square(x)
+#define m_floor(x) c_new(floor(REAL(x)),floor(IMAG(x)))
+#define m_round(x) c_new(round(REAL(x)),round(IMAG(x)))
+#define m_ceil(x)  c_new(ceil(REAL(x)),ceil(IMAG(x)))
+#define m_trunc(x) c_new(trunc(REAL(x)),trunc(IMAG(x)))
+#define m_rint(x)  c_new(rint(REAL(x)),rint(IMAG(x)))
+#define m_sign(x)  c_new( \
+ ((REAL(x)==0) ? 0.0:((REAL(x)>0) ? 1.0:((REAL(x)<0) ? -1.0:REAL(x)))), \
+ ((IMAG(x)==0) ? 0.0:((IMAG(x)>0) ? 1.0:((IMAG(x)<0) ? -1.0:IMAG(x)))))
+#define m_copysign(x,y) c_new(copysign(REAL(x),REAL(y)),copysign(IMAG(x),IMAG(y)))
+
+#define m_im(x)    c_im(x)
+#define m_conj(x)  c_new(REAL(x),-IMAG(x))
+#define m_arg(x)   atan2(IMAG(x),REAL(x))
+
+#define m_eq(x,y) c_eq(x,y)
+#define m_ne(x,y) c_ne(x,y)
+#define m_nearly_eq(x,y) c_nearly_eq(x,y)
+
+#define m_isnan(x)    c_isnan(x)
+#define m_isinf(x)    c_isinf(x)
+#define m_isposinf(x) c_isposinf(x)
+#define m_isneginf(x) c_isneginf(x)
+#define m_isfinite(x) c_isfinite(x)
+
+#define m_sprintf(s,x) sprintf(s,"%g%+gi",REAL(x),IMAG(x))
+
+#define m_sqrt(x)    c_sqrt(x)
+#define m_cbrt(x)    c_cbrt(x)
+#define m_log(x)     c_log(x)
+#define m_log2(x)    c_log2(x)
+#define m_log10(x)   c_log10(x)
+#define m_exp(x)     c_exp(x)
+#define m_exp2(x)    c_exp2(x)
+#define m_exp10(x)   c_exp10(x)
+#define m_sin(x)     c_sin(x)
+#define m_cos(x)     c_cos(x)
+#define m_tan(x)     c_tan(x)
+#define m_asin(x)    c_asin(x)
+#define m_acos(x)    c_acos(x)
+#define m_atan(x)    c_atan(x)
+#define m_sinh(x)    c_sinh(x)
+#define m_cosh(x)    c_cosh(x)
+#define m_tanh(x)    c_tanh(x)
+#define m_asinh(x)   c_asinh(x)
+#define m_acosh(x)   c_acosh(x)
+#define m_atanh(x)   c_atanh(x)
+#define m_hypot(x,y) c_hypot(x,y)
+#define m_sinc(x)    c_div(c_sin(x),x)
+
+#define m_sum_init INT2FIX(0)
+#define m_mulsum_init INT2FIX(0)
+
+#define m_mulsum(x,y,z) {z = m_add(m_mul(x,y),z);}
+#define m_mulsum_nan(x,y,z) {            \
+        if(!m_isnan(x) && !m_isnan(y)) { \
+            z = m_add(m_mul(x,y),z);     \
+        }}
+
+#define m_cumsum(x,y) {(x)=m_add(x,y);}
+#define m_cumsum_nan(x,y) {       \
+        if (m_isnan(x)) {         \
+            (x) = (y);            \
+        } else if (!m_isnan(y)) { \
+            (x) = m_add(x,y);     \
+        }}
+
+#define m_cumprod(x,y) {(x)=m_mul(x,y);}
+#define m_cumprod_nan(x,y) {      \
+        if (m_isnan(x)) {         \
+            (x) = (y);            \
+        } else if (!m_isnan(y)) { \
+            (x) = m_mul(x,y);     \
+        }}
+
+/* Add up n elements of type dtype starting at p, advancing p by stride
+ * bytes after each one.  Returns c_zero() when n is 0. */
+static inline dtype f_sum(size_t n, char *p, ssize_t stride)
+{
+    dtype total = c_zero();
+    size_t left;
+
+    for (left = n; left > 0; left--) {
+        total = c_add(*(dtype*)p, total);
+        p += stride;
+    }
+    return total;
+}
+
+/* Sum of n dtype elements spaced stride bytes apart, leaving out every
+ * element for which c_isnan() is true. */
+static inline dtype f_sum_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype total = c_zero();
+    dtype elem;
+    size_t left;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (c_isnan(elem)) continue;
+        total = c_add(elem, total);
+    }
+    return total;
+}
+
+static inline dtype f_kahan_sum(size_t n, char *p, ssize_t stride)
+{
+    size_t i=n;
+    dtype x;
+    volatile dtype y,t,r;
+    /* Compensated sum: y is the running total, r the carried remainder. */
+    y = c_zero();
+    r = c_zero();
+    for (; i--;) {
+        x = *(dtype*)p;
+        if (fabs(REAL(x)) > fabs(REAL(y))) {   /* keep the larger real part in y */
+            double z=REAL(x); REAL(x)=REAL(y); REAL(y)=z;
+        }
+        if (fabs(IMAG(x)) > fabs(IMAG(y))) {   /* likewise for the imaginary part */
+            double z=IMAG(x); IMAG(x)=IMAG(y); IMAG(y)=z;
+        }
+        r = c_add(x, r);
+        t = y;
+        y = c_add(r, y);
+        t = c_sub(y, t);   /* amount that y actually changed by */
+        r = c_sub(r, t);   /* what is left to carry into the next step */
+        p += stride;
+    }
+    return y;   /* the final remainder r is not added */
+}
+
+static inline dtype f_kahan_sum_nan(size_t n, char *p, ssize_t stride)
+{
+    size_t i=n;
+    dtype x;
+    volatile dtype y,t,r;
+    /* Compensated sum that leaves out elements for which c_isnan() is true. */
+    y = c_zero();
+    r = c_zero();
+    for (; i--;) {
+        x = *(dtype*)p;
+        if (!c_isnan(x)) {
+            if (fabs(REAL(x)) > fabs(REAL(y))) {   /* keep the larger real part in y */
+                double z=REAL(x); REAL(x)=REAL(y); REAL(y)=z;
+            }
+            if (fabs(IMAG(x)) > fabs(IMAG(y))) {   /* likewise for the imaginary part */
+                double z=IMAG(x); IMAG(x)=IMAG(y); IMAG(y)=z;
+            }
+            r = c_add(x, r);
+            t = y;
+            y = c_add(r, y);
+            t = c_sub(y, t);   /* amount that y actually changed by */
+            r = c_sub(r, t);   /* what is left to carry into the next step */
+        }
+        p += stride;
+    }
+    return y;   /* the final remainder r is not added */
+}
+
+/* Product of n dtype elements spaced stride bytes apart.  Returns c_one()
+ * when n is 0. */
+static inline dtype f_prod(size_t n, char *p, ssize_t stride)
+{
+    dtype product = c_one();
+    size_t left;
+
+    for (left = n; left > 0; left--) {
+        product = c_mul(*(dtype*)p, product);
+        p += stride;
+    }
+    return product;
+}
+
+/* Product of n dtype elements spaced stride bytes apart, leaving out every
+ * element for which c_isnan() is true. */
+static inline dtype f_prod_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype product = c_one();
+    dtype elem;
+    size_t left;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (c_isnan(elem)) continue;
+        product = c_mul(elem, product);
+    }
+    return product;
+}
+
+/* Arithmetic mean of n dtype elements laid out stride bytes apart: the
+ * element sum divided by the element count via c_div_r().  With n == 0
+ * the count passed to c_div_r() is zero. */
+static inline dtype f_mean(size_t n, char *p, ssize_t stride)
+{
+    dtype total = c_zero();
+    size_t seen = 0;
+
+    while (seen < n) {
+        total = c_add(*(dtype*)p, total);
+        p += stride;
+        seen++;
+    }
+    return c_div_r(total,seen);
+}
+
+/* Mean of the elements for which c_isnan() is false; the divisor passed
+ * to c_div_r() is the number of such elements. */
+static inline dtype f_mean_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype total = c_zero();
+    dtype elem;
+    size_t used = 0;
+    size_t left;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (c_isnan(elem)) continue;
+        total = c_add(elem, total);
+        used++;
+    }
+    return c_div_r(total,used);
+}
+
+/* Sample variance of n complex elements: the c_abs_square() of each
+ * element's distance from f_mean(), summed and divided by (count - 1).
+ * The divisor is formed in size_t arithmetic. */
+static inline rtype f_var(size_t n, char *p, ssize_t stride)
+{
+    dtype mean, elem;
+    rtype sq_dev = 0;
+    size_t seen;
+
+    mean = f_mean(n,p,stride);
+    for (seen = 0; seen < n; seen++) {
+        elem = *(dtype*)p;
+        sq_dev += c_abs_square(c_sub(elem,mean));
+        p += stride;
+    }
+    return sq_dev/(seen-1);
+}
+
+/* Sample variance over the elements for which c_isnan() is false, taken
+ * around f_mean_nan() and divided by (number of such elements - 1).
+ * The divisor is formed in size_t arithmetic. */
+static inline rtype f_var_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype mean, elem;
+    rtype sq_dev = 0;
+    size_t used = 0;
+    size_t left;
+
+    mean = f_mean_nan(n,p,stride);
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (c_isnan(elem)) continue;
+        sq_dev += c_abs_square(c_sub(elem,mean));
+        used++;
+    }
+    return sq_dev/(used-1);
+}
+
+static inline rtype f_stddev(size_t n, char *p, ssize_t stride)
+{   /* standard deviation: r_sqrt() of f_var() over the same elements */
+    return r_sqrt(f_var(n,p,stride));
+}
+
+static inline rtype f_stddev_nan(size_t n, char *p, ssize_t stride)
+{   /* standard deviation: r_sqrt() of f_var_nan() over the same elements */
+    return r_sqrt(f_var_nan(n,p,stride));
+}
+
+/* Root mean square of n complex elements: r_sqrt() of the c_abs_square()
+ * total divided by the element count. */
+static inline rtype f_rms(size_t n, char *p, ssize_t stride)
+{
+    dtype elem;
+    rtype sq_sum = 0;
+    size_t seen;
+
+    for (seen = 0; seen < n; seen++) {
+        elem = *(dtype*)p;
+        sq_sum += c_abs_square(elem);
+        p += stride;
+    }
+    return r_sqrt(sq_sum/seen);
+}
+
+/* Root mean square over the elements for which c_isnan() is false; the
+ * divisor is the number of such elements. */
+static inline rtype f_rms_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype elem;
+    rtype sq_sum = 0;
+    size_t used = 0;
+    size_t left;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (c_isnan(elem)) continue;
+        sq_sum += c_abs_square(elem);
+        used++;
+    }
+    return r_sqrt(sq_sum/used);
+}
+
+static inline dtype f_seq(dtype x, dtype y, double c)
+{
+    return c_add(x,c_mul_r(y,c));
+}
diff --git a/ext/numo/narray/numo/types/dcomplex.h b/ext/numo/narray/numo/types/dcomplex.h
new file mode 100644
index 0000000..3b3cb54
--- /dev/null
+++ b/ext/numo/narray/numo/types/dcomplex.h
@@ -0,0 +1,44 @@
+typedef dcomplex dtype;
+typedef double rtype;
+#define cT  numo_cDComplex
+#define cRT numo_cDFloat
+#define mTM numo_mDComplexMath
+
+#include "complex_macro.h"
+
+static inline bool c_nearly_eq(dtype x, dtype y) {  /* |x-y| within 2*DBL_EPSILON of |x|+|y| */
+    return c_abs(c_sub(x,y)) <= (c_abs(x)+c_abs(y))*DBL_EPSILON*2;
+}
+
+#ifdef SFMT_H
+/* random complex number: each part is a genrand_res53_mix() draw times the matching part of max */
+inline static dtype m_rand(dtype max)
+{
+    dtype sample;
+    REAL(sample) = REAL(max) * genrand_res53_mix();
+    IMAG(sample) = IMAG(max) * genrand_res53_mix();
+    return sample;
+}
+
+/* Generates one complex sample whose real and imaginary parts are normally
+   distributed (polar form of the Box-Muller transformation), scaled by
+   sigma and shifted by mu.  The result is stored in *a0. */
+inline static void m_rand_norm(dtype mu, rtype sigma, dtype *a0)
+{
+    rtype x1, x2, w;
+    do {
+        x1 = genrand_res53_mix();
+        x1 = x1*2-1;
+        x2 = genrand_res53_mix();
+        x2 = x2*2-1;
+        w = x1 * x1 + x2 * x2;
+    } while (w>=1 || w==0);  /* w==0 would make log(w)/w undefined below */
+    w = sqrt( (-2*log(w)) / w );
+    REAL(*a0) = x1*w * sigma + REAL(mu);
+    IMAG(*a0) = x2*w * sigma + IMAG(mu);
+}
+#endif
+
+#define M_EPSILON rb_float_new(2.2204460492503131e-16)
+#define M_MIN     rb_float_new(2.2250738585072014e-308)
+#define M_MAX     rb_float_new(1.7976931348623157e+308)
diff --git a/ext/numo/narray/numo/types/dfloat.h b/ext/numo/narray/numo/types/dfloat.h
new file mode 100644
index 0000000..42e68a3
--- /dev/null
+++ b/ext/numo/narray/numo/types/dfloat.h
@@ -0,0 +1,42 @@
+typedef double dtype;
+typedef double rtype;
+#define cT  numo_cDFloat
+#define cRT numo_cDFloat
+#define mTM numo_mDFloatMath
+
+#include "float_macro.h"
+
+#ifdef SFMT_H
+/* random number: one genrand_res53_mix() draw scaled by max */
+inline static dtype m_rand(dtype max)
+{
+    return max * genrand_res53_mix();
+}
+
+/* Generates two normally distributed random numbers using the polar form
+   of the Box-Muller transformation; each is scaled by sigma and shifted by
+   mu.  A NULL a0 or a1 skips storing that value. */
+inline static void m_rand_norm(dtype mu, dtype sigma, dtype *a0, dtype *a1)
+{
+    dtype x1, x2, w;
+    do {
+        x1 = genrand_res53_mix();
+        x1 = x1*2-1;
+        x2 = genrand_res53_mix();
+        x2 = x2*2-1;
+        w = x1 * x1 + x2 * x2;
+    } while (w>=1 || w==0);  /* w==0 would make log(w)/w undefined below */
+    w = sqrt( (-2*log(w)) / w );
+    if (a0) {*a0 = x1*w * sigma + mu;}
+    if (a1) {*a1 = x2*w * sigma + mu;}
+}
+#endif
+
+#define m_min_init numo_dfloat_new_dim0(0.0/0.0)
+#define m_max_init numo_dfloat_new_dim0(0.0/0.0)
+#define m_extract(x) rb_float_new(*(double*)x)
+#define m_nearly_eq(x,y) (fabs(x-y)<=(fabs(x)+fabs(y))*DBL_EPSILON*2)
+
+#define M_EPSILON rb_float_new(2.2204460492503131e-16)
+#define M_MIN     rb_float_new(2.2250738585072014e-308)
+#define M_MAX     rb_float_new(1.7976931348623157e+308)
diff --git a/ext/numo/narray/numo/types/float_def.h b/ext/numo/narray/numo/types/float_def.h
new file mode 100644
index 0000000..a9d22f1
--- /dev/null
+++ b/ext/numo/narray/numo/types/float_def.h
@@ -0,0 +1,34 @@
+#ifndef DBL_EPSILON
+#define DBL_EPSILON 2.2204460492503131e-16
+#endif
+#ifndef FLT_EPSILON
+#define FLT_EPSILON 1.1920928955078125e-07
+#endif
+#ifndef DBL_MAX
+#define DBL_MAX 1.7976931348623157e+308
+#endif
+#ifndef DBL_MIN  /* fallback when <float.h> did not provide it */
+#define DBL_MIN 2.2250738585072014e-308
+#endif
+#ifndef FLT_MIN
+#define FLT_MIN 1.1754943508222875e-38
+#endif
+#ifndef FLT_MAX
+#define FLT_MAX 3.4028234663852886e+38
+#endif
+
+#ifndef M_PI_2
+#define M_PI_2         1.57079632679489661923  /* pi/2 */
+#endif
+#ifndef M_LOG2E
+#define M_LOG2E        1.4426950408889634074   /* log_2 e */
+#endif
+#ifndef M_LOG10E
+#define M_LOG10E       0.43429448190325182765  /* log_10 e */
+#endif
+#ifndef M_LN2
+#define M_LN2          0.69314718055994530942  /* log_e 2 */
+#endif
+#ifndef M_LN10
+#define M_LN10         2.30258509299404568402  /* log_e 10 */
+#endif
diff --git a/ext/numo/narray/numo/types/float_macro.h b/ext/numo/narray/numo/types/float_macro.h
new file mode 100644
index 0000000..3b50e35
--- /dev/null
+++ b/ext/numo/narray/numo/types/float_macro.h
@@ -0,0 +1,186 @@
+#include "float_def.h"
+
+EXTERN double round(double);
+EXTERN double log2(double);
+EXTERN double exp2(double);
+#ifdef HAVE_EXP10
+EXTERN double exp10(double);
+#else
+EXTERN double pow(double, double);
+#endif
+
+#define m_zero 0.0
+#define m_one  1.0
+
+#define m_num_to_data(x) NUM2DBL(x)
+#define m_data_to_num(x) rb_float_new(x)
+
+#define m_from_double(x) (x)
+#define m_from_real(x) (x)
+
+#define m_add(x,y) ((x)+(y))
+#define m_sub(x,y) ((x)-(y))
+#define m_mul(x,y) ((x)*(y))
+#define m_div(x,y) ((x)/(y))
+#define m_div_check(x,y) ((y)==0)
+#define m_mod(x,y) fmod(x,y)
+#define m_divmod(x,y,a,b) {a=(x)/(y); b=m_mod(x,y);}
+#define m_pow(x,y) pow(x,y)
+#define m_pow_int(x,y) pow_int(x,y)
+
+#define m_abs(x)     fabs(x)
+#define m_minus(x)   (-(x))
+#define m_reciprocal(x) (1/(x))
+#define m_square(x)  ((x)*(x))
+#define m_floor(x)   floor(x)
+#define m_round(x)   round(x)
+#define m_ceil(x)    ceil(x)
+#define m_trunc(x)   trunc(x)
+#define m_rint(x)    rint(x)
+#define m_sign(x)    (((x)==0) ? 0.0:(((x)>0) ? 1.0:(((x)<0) ? -1.0:(x))))
+#define m_copysign(x,y) copysign(x,y)
+#define m_signbit(x) signbit(x)
+#define m_modf(x,y,z) {double d; y=modf(x,&d); z=d;}
+
+#define m_eq(x,y) ((x)==(y))
+#define m_ne(x,y) ((x)!=(y))
+#define m_gt(x,y) ((x)>(y))
+#define m_ge(x,y) ((x)>=(y))
+#define m_lt(x,y) ((x)<(y))
+#define m_le(x,y) ((x)<=(y))
+
+#define m_isnan(x) isnan(x)
+#define m_isinf(x) isinf(x)
+#define m_isposinf(x) (isinf(x) && signbit(x)==0)
+#define m_isneginf(x) (isinf(x) && signbit(x))
+#define m_isfinite(x) isfinite(x)
+
+#define m_mulsum_init INT2FIX(0)
+
+#define m_sprintf(s,x) sprintf(s,"%g",x)
+
+#define cmp_prnan(a,b)                         \
+    ((qsort_cast(a)==qsort_cast(b)) ? 0 :      \
+     (qsort_cast(a) > qsort_cast(b)) ? 1 : -1)
+
+#define cmp_ignan(a,b)                                                  \
+    (m_isnan(qsort_cast(a)) ? (m_isnan(qsort_cast(b)) ? 0 : 1) :        \
+     (m_isnan(qsort_cast(b)) ? -1 :                                     \
+      ((qsort_cast(a)==qsort_cast(b)) ? 0 :                             \
+       (qsort_cast(a) > qsort_cast(b)) ? 1 : -1)))
+
+#define cmpgt_prnan(a,b)                        \
+    (qsort_cast(a) > qsort_cast(b))
+
+#define cmpgt_ignan(a,b)                                      \
+    ((m_isnan(qsort_cast(a)) && !m_isnan(qsort_cast(b))) ||   \
+     (qsort_cast(a) > qsort_cast(b)))
+
+#define m_sqrt(x)    sqrt(x)
+#define m_cbrt(x)    cbrt(x)
+#define m_log(x)     log(x)
+#define m_log2(x)    log2(x)
+#define m_log10(x)   log10(x)
+#define m_exp(x)     exp(x)
+#define m_exp2(x)    exp2(x)
+#ifdef HAVE_EXP10
+#define m_exp10(x)   exp10(x)
+#else
+#define m_exp10(x)   pow(10, x)
+#endif
+#define m_expm1(x)   expm1(x)
+#define m_log1p(x)   log1p(x)
+
+#define m_sin(x)     sin(x)
+#define m_cos(x)     cos(x)
+#define m_tan(x)     tan(x)
+#define m_asin(x)    asin(x)
+#define m_acos(x)    acos(x)
+#define m_atan(x)    atan(x)
+#define m_sinh(x)    sinh(x)
+#define m_cosh(x)    cosh(x)
+#define m_tanh(x)    tanh(x)
+#define m_asinh(x)   asinh(x)
+#define m_acosh(x)   acosh(x)
+#define m_atanh(x)   atanh(x)
+#define m_atan2(x,y) atan2(x,y)
+#define m_hypot(x,y) hypot(x,y)
+#define m_sinc(x)    (sin(x)/(x))
+
+#define m_erf(x)     erf(x)
+#define m_erfc(x)    erfc(x)
+#define m_ldexp(x,y) ldexp(x,y)
+#define m_frexp(x,exp) frexp(x,exp)
+
+/* x**p for an integer exponent.  Exponents 0..4 are expanded by hand,
+ * negative ones use the reciprocal, exponents above 64 defer to pow(),
+ * and the rest use binary exponentiation. */
+static inline dtype pow_int(dtype x, int p)
+{
+    dtype acc = 1;
+    if (p==0) return 1;
+    if (p==1) return x;
+    if (p==2) return x*x;
+    if (p==3) return x*x*x;
+    if (p==4) {x=x*x; return x*x;}
+    if (p<0)  return 1/pow_int(x,-p);
+    if (p>64) return pow(x,p);
+    /* multiply in the current square whenever the low bit of p is set */
+    for (; p; p >>= 1, x *= x) {
+        if (p&1) acc *= x;
+    }
+    return acc;
+}
+
+static inline dtype f_seq(dtype x, dtype y, double c)
+{
+    return x + y * c;
+}
+
+static inline dtype f_kahan_sum(size_t n, char *p, ssize_t stride)
+{
+    size_t i=n;
+    dtype x;
+    volatile dtype y=0;
+    volatile dtype t,r=0;
+    /* Compensated sum: y is the running total, r the carried remainder. */
+    for (; i--;) {
+        x = *(dtype*)p;
+        p += stride;
+        if (fabs(x) > fabs(y)) {   /* keep the larger magnitude in y */
+            dtype z=x; x=y; y=z;
+        }
+        r += x;
+        t = y;
+        y += r;
+        t = y-t;   /* amount that y actually changed by */
+        r -= t;    /* what is left to carry into the next step */
+    }
+    return y;      /* the final remainder r is not added */
+}
+
+static inline dtype f_kahan_sum_nan(size_t n, char *p, ssize_t stride)
+{
+    size_t i=n;
+    dtype x;
+    volatile dtype y=0;
+    volatile dtype t,r=0;
+    /* Compensated sum that leaves out elements for which m_isnan() is true. */
+    for (; i--;) {
+        x = *(dtype*)p;
+        p += stride;
+        if (!m_isnan(x)) {
+            if (fabs(x) > fabs(y)) {   /* keep the larger magnitude in y */
+                dtype z=x; x=y; y=z;
+            }
+            r += x;
+            t = y;
+            y += r;
+            t = y-t;   /* amount that y actually changed by */
+            r -= t;    /* what is left to carry into the next step */
+        }
+    }
+    return y;          /* the final remainder r is not added */
+}
+
+#include "real_accum.h"
diff --git a/ext/numo/narray/numo/types/int16.h b/ext/numo/narray/numo/types/int16.h
new file mode 100644
index 0000000..9342f6a
--- /dev/null
+++ b/ext/numo/narray/numo/types/int16.h
@@ -0,0 +1,21 @@
+typedef int16_t dtype;
+typedef int16_t rtype;
+#define cT  numo_cInt16
+#define cRT cT
+
+#define m_num_to_data(x) ((dtype)NUM2INT(x))
+#define m_data_to_num(x) INT2NUM((int)(x))
+#define m_extract(x)     INT2NUM((int)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%d",(int)(x))
+
+#include "int_macro.h"
+
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+
+#define M_MIN  m_data_to_num(INT16_MIN)
+#define M_MAX  m_data_to_num(INT16_MAX)
diff --git a/ext/numo/narray/numo/types/int32.h b/ext/numo/narray/numo/types/int32.h
new file mode 100644
index 0000000..5d472d2
--- /dev/null
+++ b/ext/numo/narray/numo/types/int32.h
@@ -0,0 +1,21 @@
+typedef int32_t dtype;
+typedef int32_t rtype;
+#define cT  numo_cInt32
+#define cRT cT
+
+#define m_num_to_data(x) ((dtype)NUM2INT32(x))
+#define m_data_to_num(x) INT322NUM((int32_t)(x))
+#define m_extract(x)     INT322NUM((int32_t)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%"PRId32,(int32_t)(x))
+
+#include "int_macro.h"
+
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+
+#define M_MIN  m_data_to_num(INT32_MIN)
+#define M_MAX  m_data_to_num(INT32_MAX)
diff --git a/ext/numo/narray/numo/types/int64.h b/ext/numo/narray/numo/types/int64.h
new file mode 100644
index 0000000..bfb9426
--- /dev/null
+++ b/ext/numo/narray/numo/types/int64.h
@@ -0,0 +1,21 @@
+typedef int64_t dtype;
+typedef int64_t rtype;
+#define cT  numo_cInt64
+#define cRT cT
+
+#define m_num_to_data(x) ((dtype)NUM2INT64(x))
+#define m_data_to_num(x) INT642NUM((int64_t)(x))
+#define m_extract(x)     INT642NUM((int64_t)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%"PRId64,(int64_t)(x))
+
+#include "int_macro.h"
+
+#ifndef INT64_MIN
+#define INT64_MIN (-9223372036854775807l-1)
+#endif
+#ifndef INT64_MAX
+#define INT64_MAX (9223372036854775807l)
+#endif
+
+#define M_MIN  m_data_to_num(INT64_MIN)
+#define M_MAX  m_data_to_num(INT64_MAX)
diff --git a/ext/numo/narray/numo/types/int8.h b/ext/numo/narray/numo/types/int8.h
new file mode 100644
index 0000000..676d5e9
--- /dev/null
+++ b/ext/numo/narray/numo/types/int8.h
@@ -0,0 +1,21 @@
+typedef int8_t dtype;
+typedef int8_t rtype;
+#define cT  numo_cInt8
+#define cRT cT
+
+#define m_num_to_data(x) ((dtype)NUM2INT(x))
+#define m_data_to_num(x) INT2NUM((int)(x))
+#define m_extract(x)     INT2NUM((int)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%d",(int)(x))
+
+#include "int_macro.h"
+
+#ifndef INT8_MIN
+#define INT8_MIN (-127-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+
+#define M_MIN  INT2FIX(INT8_MIN)
+#define M_MAX  INT2FIX(INT8_MAX)
diff --git a/ext/numo/narray/numo/types/int_macro.h b/ext/numo/narray/numo/types/int_macro.h
new file mode 100644
index 0000000..d795426
--- /dev/null
+++ b/ext/numo/narray/numo/types/int_macro.h
@@ -0,0 +1,35 @@
+#include "xint_macro.h"
+
+#define m_abs(x)     (((x)<0)?-(x):(x))  /* argument parenthesised so expressions like a-b expand correctly */
+#define m_sign(x)    (((x)==0) ? 0 : (((x)>0) ? 1 : -1))
+
+/* Integer reciprocal 1/x: 1 and -1 map to themselves, 0 raises
+ * rb_eZeroDivError with the message "divided by 0", and every other
+ * value gives 0. */
+static inline dtype int_reciprocal(dtype x) {
+    if (x == 1 || x == -1) {
+        return x;
+    }
+    if (x == 0) {
+        rb_raise(rb_eZeroDivError, "divided by 0");
+    }
+    return 0;
+}
+
+/* Integer power x**p.  Exponents 0..3 are expanded directly, any negative
+ * exponent yields 0, and the remaining cases use binary exponentiation
+ * starting from m_one. */
+static dtype pow_int(dtype x, int p)
+{
+    dtype acc = m_one;
+    if (p==0) return 1;
+    if (p==1) return x;
+    if (p==2) return x*x;
+    if (p==3) return x*x*x;
+    if (p<0) return 0;
+    /* multiply in the current square whenever the low bit of p is set */
+    for (; p; p >>= 1, x *= x) {
+        if (p&1) acc *= x;
+    }
+    return acc;
+}
diff --git a/ext/numo/narray/numo/types/real_accum.h b/ext/numo/narray/numo/types/real_accum.h
new file mode 100644
index 0000000..d0f68f6
--- /dev/null
+++ b/ext/numo/narray/numo/types/real_accum.h
@@ -0,0 +1,440 @@
+
+#define m_mulsum(x,y,z) {z = m_add(m_mul(x,y),z);}
+#define m_mulsum_nan(x,y,z) {            \
+        if(!m_isnan(x) && !m_isnan(y)) { \
+            z = m_add(m_mul(x,y),z);     \
+        }}
+
+#define m_cumsum(x,y) {(x)=m_add(x,y);}
+#define m_cumsum_nan(x,y) {       \
+        if (m_isnan(x)) {         \
+            (x) = (y);            \
+        } else if (!m_isnan(y)) { \
+            (x) = m_add(x,y);     \
+        }}
+
+#define m_cumprod(x,y) {(x)=m_mul(x,y);}
+#define m_cumprod_nan(x,y) {      \
+        if (m_isnan(x)) {         \
+            (x) = (y);            \
+        } else if (!m_isnan(y)) { \
+            (x) = m_mul(x,y);     \
+        }}
+
+/* Sum of n dtype elements spaced stride bytes apart, starting from m_zero. */
+static inline dtype f_sum(size_t n, char *p, ssize_t stride)
+{
+    dtype total = m_zero;
+    dtype elem;
+    size_t left;
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        total = m_add(elem,total);
+    }
+    return total;
+}
+
+/* Sum of the elements for which m_isnan() is false, starting from m_zero. */
+static inline dtype f_sum_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype total = m_zero;
+    dtype elem;
+    size_t left;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) continue;
+        total = m_add(elem,total);
+    }
+    return total;
+}
+
+
+/* Product of n dtype elements spaced stride bytes apart, starting from m_one. */
+static inline dtype f_prod(size_t n, char *p, ssize_t stride)
+{
+    dtype product = m_one;
+    dtype elem;
+    size_t left;
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        product = m_mul(elem,product);
+    }
+    return product;
+}
+
+/* Product of the elements for which m_isnan() is false, starting from m_one. */
+static inline dtype f_prod_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype product = m_one;
+    dtype elem;
+    size_t left;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) continue;
+        product = m_mul(elem,product);
+    }
+    return product;
+}
+
+/* Mean of n elements: their m_add() total divided, via m_div(), by the
+ * element count converted with m_from_real(). */
+static inline dtype f_mean(size_t n, char *p, ssize_t stride)
+{
+    dtype total = m_zero;
+    dtype elem;
+    size_t seen;
+
+    for (seen = 0; seen < n; seen++, p += stride) {
+        elem = *(dtype*)p;
+        total = m_add(elem,total);
+    }
+    return m_div(total,m_from_real(seen));
+}
+
+/* Mean of the elements for which m_isnan() is false; the divisor is the
+ * number of such elements. */
+static inline dtype f_mean_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype total = m_zero;
+    dtype elem;
+    size_t left, used = 0;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) continue;
+        total = m_add(elem,total);
+        used++;
+    }
+    return m_div(total,m_from_real(used));
+}
+
+/* Sample variance: each element's m_abs() distance from f_mean() is
+ * squared and accumulated, then divided by (count - 1).  The divisor is
+ * formed in size_t arithmetic before m_from_real() converts it. */
+static inline dtype f_var(size_t n, char *p, ssize_t stride)
+{
+    dtype sq_sum = m_zero;
+    dtype elem, dev, mean;
+    size_t seen;
+
+    mean = f_mean(n,p,stride);
+    for (seen = 0; seen < n; seen++) {
+        elem = *(dtype*)p;
+        p += stride;
+        dev = m_abs(m_sub(elem,mean));
+        sq_sum = m_add(sq_sum,m_square(dev));
+    }
+    return m_div(sq_sum,m_from_real(seen-1));
+}
+
+/* Sample variance over the elements for which m_isnan() is false, taken
+ * around f_mean_nan() and divided by (number of such elements - 1).  The
+ * divisor is formed in size_t arithmetic before m_from_real() converts it. */
+static inline dtype f_var_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype sq_sum = m_zero;
+    dtype elem, dev, mean;
+    size_t left;
+    size_t used = 0;
+
+    mean = f_mean_nan(n,p,stride);
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) continue;
+        dev = m_abs(m_sub(elem,mean));
+        sq_sum = m_add(sq_sum,m_square(dev));
+        used++;
+    }
+    return m_div(sq_sum,m_from_real(used-1));
+}
+
+static inline dtype f_stddev(size_t n, char *p, ssize_t stride)
+{   /* standard deviation: m_sqrt() of f_var() over the same elements */
+    return m_sqrt(f_var(n,p,stride));
+}
+
+static inline dtype f_stddev_nan(size_t n, char *p, ssize_t stride)
+{   /* standard deviation: m_sqrt() of f_var_nan() over the same elements */
+    return m_sqrt(f_var_nan(n,p,stride));
+}
+
+/* Root mean square of n elements: m_sqrt() of the accumulated squares
+ * divided by the element count. */
+static inline dtype f_rms(size_t n, char *p, ssize_t stride)
+{
+    dtype sq_sum = m_zero;
+    dtype elem;
+    size_t seen;
+
+    for (seen = 0; seen < n; seen++, p += stride) {
+        elem = *(dtype*)p;
+        sq_sum = m_add(sq_sum,m_square(m_abs(elem)));
+    }
+    return m_sqrt(m_div(sq_sum,m_from_real(seen)));
+}
+
+/* Root mean square over the elements for which m_isnan() is false; the
+ * divisor is the number of such elements. */
+static inline dtype f_rms_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype sq_sum = m_zero;
+    dtype elem;
+    size_t left, used = 0;
+
+    for (left = n; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) continue;
+        sq_sum = m_add(sq_sum,m_square(m_abs(elem)));
+        used++;
+    }
+    return m_sqrt(m_div(sq_sum,m_from_real(used)));
+}
+
+// ---------------------------------------------------------
+
+/* Minimum of n elements where NaN propagates: the first element for which
+ * m_isnan() is true is returned as soon as it is met.  The first element
+ * is read unconditionally. */
+static inline dtype f_min_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype best, elem;
+    size_t rest = n - 1;
+
+    best = *(dtype*)p;
+    if (m_isnan(best)) {return best;}
+    while (rest-- > 0) {
+        p += stride;
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) {return elem;}
+        if (m_lt(elem,best)) {best = elem;}
+    }
+    return best;
+}
+
+/* Minimum of n elements, ignoring NaNs: leading elements for which
+ * m_isnan() is true are skipped.  If every element is NaN the last one
+ * read is returned; with n == 0 the result is m_zero. */
+static inline dtype f_min(size_t n, char *p, ssize_t stride)
+{
+    dtype elem, best = m_zero;
+    size_t left = n;
+
+    /* find the first non-NaN element; best keeps the last element read */
+    while (left > 0) {
+        best = *(dtype*)p;
+        p += stride;
+        left--;
+        if (!m_isnan(best)) break;
+    }
+    for (; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_lt(elem,best)) {best = elem;}
+    }
+    return best;
+}
+
+/* Maximum of n elements where NaN propagates: the first element for which
+ * m_isnan() is true is returned as soon as it is met.  The first element
+ * is read unconditionally. */
+static inline dtype f_max_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype best, elem;
+    size_t rest = n - 1;
+
+    best = *(dtype*)p;
+    if (m_isnan(best)) {return best;}
+    while (rest-- > 0) {
+        p += stride;
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) {return elem;}
+        if (m_gt(elem,best)) {best = elem;}
+    }
+    return best;
+}
+
+/* Maximum of n elements, ignoring NaNs: leading elements for which
+ * m_isnan() is true are skipped.  If every element is NaN the last one
+ * read is returned; with n == 0 the result is m_zero. */
+static inline dtype f_max(size_t n, char *p, ssize_t stride)
+{
+    dtype elem, best = m_zero;
+    size_t left = n;
+
+    /* find the first non-NaN element; best keeps the last element read */
+    while (left > 0) {
+        best = *(dtype*)p;
+        p += stride;
+        left--;
+        if (!m_isnan(best)) break;
+    }
+    for (; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_gt(elem,best)) {best = elem;}
+    }
+    return best;
+}
+
+/* Index of the minimum among n elements, with NaN propagation: the index
+ * of the first element for which m_isnan() is true is returned at once. */
+static inline size_t f_min_index_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype best, elem;
+    size_t pos, best_pos = 0;
+    best = *(dtype*)p;
+    if (m_isnan(best)) {return best_pos;}
+    for (pos = 1; pos < n; pos++) {
+        p += stride;
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) {return pos;}
+        if (m_lt(elem,best)) {
+            best = elem;
+            best_pos = pos;
+        }
+    }
+    return best_pos;
+}
+
+/* Index of the smallest of n elements, ignoring NaNs: leading elements for
+ * which m_isnan() is true are skipped.  Returns 0 if no element qualifies. */
+static inline size_t f_min_index(size_t n, char *p, ssize_t stride)
+{
+    dtype x, y;
+    size_t i, j=0;
+
+    for (i=0; i<n; i++) {
+        y = *(dtype*)p;
+        p += stride;
+        if (m_isnan(y)) continue;
+        j = i;
+        /* p is already at element i+1, so the scan must resume there;
+           restarting at i mislabelled indices and read one past the end */
+        for (i++; i<n; i++) {
+            x = *(dtype*)p;
+            p += stride;
+            if (m_lt(x,y)) {y = x; j = i;}
+        }
+        break;
+    }
+    return j;
+}
+
+/* Index of the maximum among n elements, with NaN propagation: the index
+ * of the first element for which m_isnan() is true is returned at once. */
+static inline size_t f_max_index_nan(size_t n, char *p, ssize_t stride)
+{
+    dtype best, elem;
+    size_t pos, best_pos = 0;
+    best = *(dtype*)p;
+    if (m_isnan(best)) {return best_pos;}
+    for (pos = 1; pos < n; pos++) {
+        p += stride;
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) {return pos;}
+        if (m_gt(elem,best)) {
+            best = elem;
+            best_pos = pos;
+        }
+    }
+    return best_pos;
+}
+
+/* Index of the largest of n elements, ignoring NaNs: leading elements for
+ * which m_isnan() is true are skipped.  Returns 0 if no element qualifies. */
+static inline size_t f_max_index(size_t n, char *p, ssize_t stride)
+{
+    dtype x, y;
+    size_t i, j=0;
+
+    for (i=0; i<n; i++) {
+        y = *(dtype*)p;
+        p += stride;
+        if (m_isnan(y)) continue;
+        j = i;
+        /* p is already at element i+1, so the scan must resume there;
+           restarting at i mislabelled indices and read one past the end */
+        for (i++; i<n; i++) {
+            x = *(dtype*)p;
+            p += stride;
+            if (m_gt(x,y)) {y = x; j = i;}
+        }
+        break;
+    }
+    return j;
+}
+
+/* Minimum and maximum of n elements with NaN propagation: as soon as an
+ * element for which m_isnan() is true is met, both *amin and *amax are
+ * set to that element and the scan stops.  The first element is read
+ * unconditionally. */
+static inline void
+f_minmax_nan(size_t n, char *p, ssize_t stride, dtype *amin, dtype *amax)
+{
+    dtype elem, lo, hi;
+    size_t rest = n - 1;
+
+    lo = hi = *(dtype*)p;
+    if (m_isnan(lo)) {
+        *amin = *amax = lo;
+        return;
+    }
+
+    while (rest-- > 0) {
+        p += stride;
+        elem = *(dtype*)p;
+        if (m_isnan(elem)) {
+            *amin = *amax = elem;
+            return;
+        }
+        if (m_lt(elem,lo)) {lo = elem;}
+        if (m_gt(elem,hi)) {hi = elem;}
+    }
+
+    *amin = lo;
+    *amax = hi;
+}
+
+static inline dtype f_ptp_nan(size_t n, char *p, ssize_t stride)
+{   /* peak-to-peak: max minus min, both taken from f_minmax_nan() */
+    dtype min,max;
+    f_minmax_nan(n,p,stride,&min,&max);
+    return m_sub(max,min);
+}
+
+/* Minimum and maximum of n elements, ignoring NaNs: leading elements for
+ * which m_isnan() is true are skipped.  If every element is NaN, *amin
+ * receives the last element read while *amax stays m_zero; with n == 0
+ * both outputs are m_zero. */
+static inline void
+f_minmax(size_t n, char *p, ssize_t stride, dtype *amin, dtype *amax)
+{
+    dtype elem, lo, hi;
+    size_t left = n;
+
+    lo = hi = m_zero;
+    while (left > 0) {
+        lo = *(dtype*)p;
+        p += stride;
+        left--;
+        if (!m_isnan(lo)) {
+            hi = lo;
+            break;
+        }
+    }
+    for (; left > 0; left--, p += stride) {
+        elem = *(dtype*)p;
+        if (m_lt(elem,lo)) {lo = elem;}
+        if (m_gt(elem,hi)) {hi = elem;}
+    }
+
+    *amin = lo;
+    *amax = hi;
+}
+
+static inline dtype f_ptp(size_t n, char *p, ssize_t stride)
+{   /* peak-to-peak: max minus min, both taken from f_minmax() */
+    dtype min,max;
+    f_minmax(n,p,stride,&min,&max);
+    return m_sub(max,min);
+}
diff --git a/ext/numo/narray/numo/types/robj_macro.h b/ext/numo/narray/numo/types/robj_macro.h
new file mode 100644
index 0000000..3ed73a7
--- /dev/null
+++ b/ext/numo/narray/numo/types/robj_macro.h
@@ -0,0 +1,75 @@
+/* Element-level primitives for arrays whose elements are Ruby objects:
+ * every operation is dispatched to the object with rb_funcall. */
+
+/* additive and multiplicative identities as Ruby Fixnums */
+#define m_zero INT2FIX(0)
+#define m_one  INT2FIX(1)
+
+/* elements already are Ruby values, so both conversions are the identity */
+#define m_num_to_data(x) (x)
+#define m_data_to_num(x) (x)
+
+/* wrap a C double into a Ruby Float */
+#define m_from_double(x) rb_float_new(x)
+#define m_from_real(x)   rb_float_new(x)
+
+/* binary arithmetic: call the operator method named by the character
+ * literal on x with the single argument y */
+#define m_add(x,y)     rb_funcall(x,'+',1,y)
+#define m_sub(x,y)     rb_funcall(x,'-',1,y)
+#define m_mul(x,y)     rb_funcall(x,'*',1,y)
+#define m_div(x,y)     rb_funcall(x,'/',1,y)
+#define m_mod(x,y)     rb_funcall(x,'%',1,y)
+/* m_divmod(x,y,a,b): calls the divmod method of x with argument y.  The
+ * returned array is stored back into x; a receives its element 0 and
+ * b its element 1 (previously b was also read from element 0). */
+#define m_divmod(x,y,a,b)                               \
+    {x = rb_funcall(x,id_divmod,1,y);                   \
+     a = RARRAY_PTR(x)[0]; b = RARRAY_PTR(x)[1];}
+/* power: both forms call the method named by id_pow */
+#define m_pow(x,y)     rb_funcall(x,id_pow,1,y)
+#define m_pow_int(x,y) rb_funcall(x,id_pow,1,y)
+
+/* unary operations, each a zero-argument method call on x
+ * (m_square multiplies x by itself, so x is evaluated twice) */
+#define m_abs(x)       rb_funcall(x,id_abs,0)
+#define m_minus(x)     rb_funcall(x,id_minus,0)
+#define m_reciprocal(x)   rb_funcall(x,id_reciprocal,0)
+#define m_square(x)    rb_funcall(x,'*',1,x)
+#define m_floor(x)     rb_funcall(x,id_floor,0)
+#define m_round(x)     rb_funcall(x,id_round,0)
+#define m_ceil(x)      rb_funcall(x,id_ceil,0)
+#define m_trunc(x)     rb_funcall(x,id_truncate,0)
+/* sign: calls the id_ufo method with Fixnum 0
+ * (presumably the <=> operator -- confirm where id_ufo is initialised) */
+#define m_sign(x)      rb_funcall(x,id_ufo,1,INT2FIX(0))
+
+/* comparisons: the Ruby result is reduced to a C truth value with RTEST */
+#define m_eq(x,y)      RTEST(rb_funcall(x,id_eq,1,y))
+#define m_ne(x,y)      RTEST(rb_funcall(x,id_ne,1,y))
+#define m_gt(x,y)      RTEST(rb_funcall(x,id_gt,1,y))
+#define m_ge(x,y)      RTEST(rb_funcall(x,id_ge,1,y))
+#define m_lt(x,y)      RTEST(rb_funcall(x,id_lt,1,y))
+#define m_le(x,y)      RTEST(rb_funcall(x,id_le,1,y))
+
+/* bitwise operations */
+#define m_bit_and(x,y) rb_funcall(x,id_bit_and,1,y)
+#define m_bit_or(x,y)  rb_funcall(x,id_bit_or, 1,y)
+#define m_bit_xor(x,y) rb_funcall(x,id_bit_xor,1,y)
+#define m_bit_not(x)   rb_funcall(x,id_bit_not,0)
+
+/* shifts */
+#define m_left_shift(x,y) rb_funcall(x,id_left_shift,1,y)
+#define m_right_shift(x,y) rb_funcall(x,id_right_shift,1,y)
+
+/* classification predicates: evaluate to 0 when the object does not respond
+ * to the queried method; note that x is expanded more than once */
+#define m_isnan(x)     ((rb_respond_to(x,id_nan_p)) ? RTEST(rb_funcall(x,id_nan_p,0)) : 0)
+#define m_isinf(x)     ((rb_respond_to(x,id_infinite_p)) ? RTEST(rb_funcall(x,id_infinite_p,0)) : 0)
+/* infinite and greater than 0 */
+#define m_isposinf(x)  ((rb_respond_to(x,id_infinite_p)) ?        \
+                        ((RTEST(rb_funcall(x,id_infinite_p,0))) ? \
+                         m_gt(x,INT2FIX(0)) : 0) : 0)
+/* infinite and less than 0 */
+#define m_isneginf(x)  ((rb_respond_to(x,id_infinite_p)) ?              \
+                        ((RTEST(rb_funcall(x,id_infinite_p,0))) ?       \
+                         m_lt(x,INT2FIX(0)) : 0) : 0)
+#define m_isfinite(x)  ((rb_respond_to(x,id_finite_p)) ? RTEST(rb_funcall(x,id_finite_p,0)) : 0)
+
+/* initial accumulator value for mulsum */
+#define m_mulsum_init INT2FIX(0)
+
+/* text formatting is delegated to robj_sprintf */
+#define m_sprintf(s,x) robj_sprintf(s,x)
+
+/* Writes the to_s representation of x into the buffer s with sprintf and
+ * returns sprintf's result.  No length limit is applied to the output. */
+static inline int robj_sprintf(char *s, VALUE x) {
+    VALUE str;
+
+    str = rb_funcall(x, rb_intern("to_s"), 0);
+    return sprintf(s, "%s", StringValuePtr(str));
+}
+
+/* m_sqrt(x): looks up the constant "Math" through rb_mKernel and calls its
+ * sqrt method with x.
+ * NOTE(review): the expansion ends with a ';', so the macro only works in
+ * statement position, not inside a larger expression. */
+#define m_sqrt(x)                                          \
+    rb_funcall(rb_const_get(rb_mKernel,rb_intern("Math")), \
+               rb_intern("sqrt"),1,x);
+
+/* Returns x + y*c, the c-th term of the sequence starting at x with
+ * increment y, using Ruby-level multiplication and addition. */
+static inline dtype f_seq(dtype x, dtype y, size_t c)
+{
+    dtype offset;
+
+    offset = m_mul(y,SIZET2NUM(c));
+    return m_add(x,offset);
+}
+
+#include "real_accum.h"
diff --git a/ext/numo/narray/numo/types/robject.h b/ext/numo/narray/numo/types/robject.h
new file mode 100644
index 0000000..8a81d6d
--- /dev/null
+++ b/ext/numo/narray/numo/types/robject.h
@@ -0,0 +1,27 @@
+/* Type definitions for the RObject array class: elements are Ruby VALUEs. */
+typedef VALUE dtype;
+typedef VALUE rtype;
+#define cT  numo_cRObject
+#define cRT cT
+//#define mTM mRObjectMath
+
+#include "float_def.h"
+#include "robj_macro.h"
+
+/* NOTE(review): both initial values are the C double expression 0.0/0.0
+ * although dtype is VALUE here -- confirm how callers use them */
+#define m_min_init (0.0/0.0)
+#define m_max_init (0.0/0.0)
+/* reads the VALUE stored at address x */
+#define m_extract(x) (*(VALUE*)x)
+#define m_nearly_eq(x,y) robj_nearly_eq(x,y)
+
+/* Approximate equality of two Ruby numerics: both are converted to double
+ * and compared with a relative tolerance of 2*DBL_EPSILON. */
+inline static int robj_nearly_eq(VALUE vx, VALUE vy)
+{
+    double x = NUM2DBL(vx);
+    double y = NUM2DBL(vy);
+    double tol = (fabs(x)+fabs(y))*DBL_EPSILON*2;
+
+    return (fabs(x-y)<=tol);
+}
+
+/* generates a random Ruby Float: the value of genrand_res53_mix() scaled
+ * by max */
+inline static dtype m_rand(dtype max)
+{
+    /* max is a Ruby object handle (dtype is VALUE), so it must be converted
+       to a C double before it can take part in C arithmetic */
+    return DBL2NUM(genrand_res53_mix() * NUM2DBL(max));
+}
diff --git a/ext/numo/narray/numo/types/scomplex.h b/ext/numo/narray/numo/types/scomplex.h
new file mode 100644
index 0000000..e52db3a
--- /dev/null
+++ b/ext/numo/narray/numo/types/scomplex.h
@@ -0,0 +1,44 @@
+/* Type definitions for the SComplex array class: elements are scomplex,
+ * the associated real type is float. */
+typedef scomplex dtype;
+typedef float rtype;
+#define cT  numo_cSComplex
+#define cRT numo_cSFloat
+#define mTM numo_mSComplexMath
+
+#include "complex_macro.h"
+
+/* Approximate equality of two complex values: the magnitude of the
+ * difference is compared against a relative tolerance of 2*FLT_EPSILON. */
+static inline bool c_nearly_eq(dtype x, dtype y) {
+    if (c_abs(c_sub(x,y)) <= (c_abs(x)+c_abs(y))*FLT_EPSILON*2) {
+        return 1;
+    }
+    return 0;
+}
+
+#ifdef SFMT_H
+/* generates a random complex number: real and imaginary parts are drawn
+   independently (real part first) and scaled by the matching part of max */
+inline static dtype m_rand(dtype max)
+{
+    dtype r;
+
+    REAL(r) = to_real2(gen_rand32()) * REAL(max);
+    IMAG(r) = to_real2(gen_rand32()) * IMAG(max);
+    return r;
+}
+
+/* generates a complex random number whose real and imaginary parts are
+   normally distributed around REAL(mu) and IMAG(mu) with deviation sigma,
+   using the polar (rejection) form of the Box-Muller transformation.
+ */
+inline static void m_rand_norm(dtype mu, rtype sigma, dtype *a0)
+{
+    rtype x1, x2, w;
+    do {
+        x1 = to_real2(gen_rand32());
+        x1 = x1*2-1;
+        x2 = to_real2(gen_rand32());
+        x2 = x2*2-1;
+        w = x1 * x1 + x2 * x2;
+        /* reject points outside the unit circle, and the origin itself:
+           w == 0 would evaluate log(0)/0 below */
+    } while (w>=1 || w==0);
+    w = sqrt( (-2*log(w)) / w );
+    REAL(*a0) = x1*w * sigma + REAL(mu);
+    IMAG(*a0) = x2*w * sigma + IMAG(mu);
+}
+#endif
+
+/* Numeric limits exposed as Ruby Floats; the literals match the IEEE
+ * single-precision epsilon, smallest normal and largest finite values. */
+#define M_EPSILON rb_float_new(1.1920928955078125e-07)
+#define M_MIN     rb_float_new(1.1754943508222875e-38)
+#define M_MAX     rb_float_new(3.4028234663852886e+38)
diff --git a/ext/numo/narray/numo/types/sfloat.h b/ext/numo/narray/numo/types/sfloat.h
new file mode 100644
index 0000000..e516bce
--- /dev/null
+++ b/ext/numo/narray/numo/types/sfloat.h
@@ -0,0 +1,43 @@
+/* Type definitions for the SFloat array class: elements are C floats. */
+typedef float dtype;
+typedef float rtype;
+#define cT  numo_cSFloat
+#define cRT numo_cSFloat
+#define mTM numo_mSFloatMath
+
+#include "float_macro.h"
+
+#ifdef SFMT_H
+/* generates a random number: the value of to_real2(gen_rand32()) scaled
+   by max */
+inline static dtype m_rand(dtype max)
+{
+    dtype r = to_real2(gen_rand32()) * max;
+
+    return r;
+}
+
+/* generates up to two normally distributed random numbers with mean mu and
+   deviation sigma, using the polar (rejection) form of the Box-Muller
+   transformation.  Either output pointer may be NULL, in which case that
+   value is not stored.
+ */
+inline static void m_rand_norm(dtype mu, dtype sigma, dtype *a0, dtype *a1)
+{
+    dtype x1, x2, w;
+    do {
+        x1 = to_real2(gen_rand32());
+        x1 = x1*2-1;
+        x2 = to_real2(gen_rand32());
+        x2 = x2*2-1;
+        w = x1 * x1 + x2 * x2;
+        /* reject points outside the unit circle, and the origin itself:
+           w == 0 would evaluate log(0)/0 below */
+    } while (w>=1 || w==0);
+    w = sqrt( (-2*log(w)) / w );
+    if (a0) {*a0 = x1*w * sigma + mu;}
+    if (a1) {*a1 = x2*w * sigma + mu;}
+}
+#endif
+
+/* initial values for min/max: a dim-0 SFloat built from 0.0/0.0 */
+#define m_min_init numo_sfloat_new_dim0(0.0/0.0)
+#define m_max_init numo_sfloat_new_dim0(0.0/0.0)
+
+/* reads the float stored at address x and wraps it into a Ruby Float */
+#define m_extract(x) rb_float_new(*(float*)x)
+/* approximate equality with a relative tolerance of 2*FLT_EPSILON */
+#define m_nearly_eq(x,y) (fabs(x-y)<=(fabs(x)+fabs(y))*FLT_EPSILON*2)
+
+/* numeric limits exposed as Ruby Floats */
+#define M_EPSILON rb_float_new(1.1920928955078125e-07)
+#define M_MIN     rb_float_new(1.1754943508222875e-38)
+#define M_MAX     rb_float_new(3.4028234663852886e+38)
diff --git a/ext/numo/narray/numo/types/uint16.h b/ext/numo/narray/numo/types/uint16.h
new file mode 100644
index 0000000..880c861
--- /dev/null
+++ b/ext/numo/narray/numo/types/uint16.h
@@ -0,0 +1,18 @@
+/* Type definitions for the UInt16 array class. */
+typedef u_int16_t dtype;
+typedef u_int16_t rtype;
+#define cT  numo_cUInt16
+#define cRT cT
+
+/* conversions between Ruby numbers and the element type go through
+ * unsigned int */
+#define m_num_to_data(x) ((dtype)NUM2UINT(x))
+#define m_data_to_num(x) UINT2NUM((unsigned int)(x))
+/* reads the element stored at address x and converts it to a Ruby number */
+#define m_extract(x)     UINT2NUM((unsigned int)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%u",(unsigned int)(x))
+
+#include "uint_macro.h"
+
+/* fallback when the platform headers do not provide the limit */
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535)
+#endif
+
+/* value range of the element type as Ruby numbers */
+#define M_MIN  INT2FIX(0)
+#define M_MAX  m_data_to_num(UINT16_MAX)
diff --git a/ext/numo/narray/numo/types/uint32.h b/ext/numo/narray/numo/types/uint32.h
new file mode 100644
index 0000000..8435271
--- /dev/null
+++ b/ext/numo/narray/numo/types/uint32.h
@@ -0,0 +1,18 @@
+/* Type definitions for the UInt32 array class. */
+typedef u_int32_t dtype;
+typedef u_int32_t rtype;
+#define cT  numo_cUInt32
+#define cRT cT
+
+/* conversions between Ruby numbers and the element type */
+#define m_num_to_data(x) ((dtype)NUM2UINT32(x))
+#define m_data_to_num(x) UINT322NUM((u_int32_t)(x))
+/* reads the element stored at address x and converts it to a Ruby number */
+#define m_extract(x)     UINT322NUM((u_int32_t)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%"PRIu32,(u_int32_t)(x))
+
+#include "uint_macro.h"
+
+/* fallback when the platform headers do not provide the limit */
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295u)
+#endif
+
+/* value range of the element type as Ruby numbers */
+#define M_MIN  INT2FIX(0)
+#define M_MAX  m_data_to_num(UINT32_MAX)
diff --git a/ext/numo/narray/numo/types/uint64.h b/ext/numo/narray/numo/types/uint64.h
new file mode 100644
index 0000000..0ad200e
--- /dev/null
+++ b/ext/numo/narray/numo/types/uint64.h
@@ -0,0 +1,18 @@
+/* Type definitions for the UInt64 array class. */
+typedef u_int64_t dtype;
+typedef u_int64_t rtype;
+#define cT  numo_cUInt64
+#define cRT cT
+
+/* conversions between Ruby numbers and the element type */
+#define m_num_to_data(x) ((dtype)NUM2UINT64(x))
+#define m_data_to_num(x) UINT642NUM((u_int64_t)(x))
+/* reads the element stored at address x and converts it to a Ruby number */
+#define m_extract(x)     UINT642NUM((u_int64_t)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%"PRIu64,(u_int64_t)(x))
+
+#include "uint_macro.h"
+
+/* fallback when the platform headers do not provide the limit
+ * NOTE(review): the literal carries an "ul" suffix; confirm it is wide
+ * enough on platforms where unsigned long is 32 bits */
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ul)
+#endif
+
+/* value range of the element type as Ruby numbers */
+#define M_MIN  INT2FIX(0)
+#define M_MAX  m_data_to_num(UINT64_MAX)
diff --git a/ext/numo/narray/numo/types/uint8.h b/ext/numo/narray/numo/types/uint8.h
new file mode 100644
index 0000000..4fe24e5
--- /dev/null
+++ b/ext/numo/narray/numo/types/uint8.h
@@ -0,0 +1,18 @@
+/* Type definitions for the UInt8 array class. */
+typedef u_int8_t dtype;
+typedef u_int8_t rtype;
+#define cT  numo_cUInt8
+#define cRT cT
+
+/* conversions between Ruby numbers and the element type go through
+ * unsigned int */
+#define m_num_to_data(x) ((dtype)NUM2UINT(x))
+#define m_data_to_num(x) UINT2NUM((unsigned int)(x))
+/* reads the element stored at address x and converts it to a Ruby number */
+#define m_extract(x)     UINT2NUM((unsigned int)*(dtype*)(x))
+#define m_sprintf(s,x)   sprintf(s,"%u",(unsigned int)(x))
+
+#include "uint_macro.h"
+
+/* fallback when the platform headers do not provide the limit */
+#ifndef UINT8_MAX
+#define UINT8_MAX (255)
+#endif
+
+/* value range of the element type as Ruby numbers */
+#define M_MIN  INT2FIX(0)
+#define M_MAX  m_data_to_num(UINT8_MAX)
diff --git a/ext/numo/narray/numo/types/uint_macro.h b/ext/numo/narray/numo/types/uint_macro.h
new file mode 100644
index 0000000..51639ba
--- /dev/null
+++ b/ext/numo/narray/numo/types/uint_macro.h
@@ -0,0 +1,32 @@
+/* Unsigned-integer specifics layered on the shared integer macros. */
+#include "xint_macro.h"
+
+/* an unsigned value is its own absolute value */
+#define m_abs(x)     (x)
+/* sign is 0 for zero and 1 for anything else */
+#define m_sign(x)    (((x)==0) ? 0:1)
+
+/* Integer reciprocal: 1 for x == 1, 0 for any larger value; raises
+ * ZeroDivisionError for x == 0. */
+static inline dtype int_reciprocal(dtype x) {
+    if (x == 0) {
+        rb_raise(rb_eZeroDivError, "divided by 0");
+    }
+    return (x == 1) ? 1 : 0;
+}
+
+/* Raises x to the integer power p by repeated squaring; small exponents
+ * are answered directly.  A negative exponent yields 0. */
+static dtype pow_int(dtype x, int p)
+{
+    dtype r = m_one;
+    switch(p) {
+    case 0: return 1;
+    case 1: return x;
+    case 2: return x*x;
+    case 3: return x*x*x;
+    }
+    /* without this guard a negative p would never reach 0 in the loop below
+       when >> is an arithmetic shift, so the loop would not terminate */
+    if (p<0) return 0;
+    while (p) {
+        if (p&1) r *= x;
+        x *= x;
+        p >>= 1;
+    }
+    return r;
+}
diff --git a/ext/numo/narray/numo/types/xint_macro.h b/ext/numo/narray/numo/types/xint_macro.h
new file mode 100644
index 0000000..89fe0ab
--- /dev/null
+++ b/ext/numo/narray/numo/types/xint_macro.h
@@ -0,0 +1,173 @@
+/* Element-level primitives shared by the integer array types: every
+ * operation maps directly onto the C operator. */
+
+/* additive and multiplicative identities */
+#define m_zero 0
+#define m_one  1
+
+/* conversions from floating point are left to C's implicit conversion */
+#define m_from_double(x) (x)
+#define m_from_real(x) (x)
+
+/* arithmetic */
+#define m_add(x,y) ((x)+(y))
+#define m_sub(x,y) ((x)-(y))
+#define m_mul(x,y) ((x)*(y))
+#define m_div(x,y) ((x)/(y))
+#define m_mod(x,y) ((x)%(y))
+/* quotient into a, remainder into b */
+#define m_divmod(x,y,a,b) {a=(x)/(y); b=m_mod(x,y);}
+/* both power forms use pow_int, which the including header defines */
+#define m_pow(x,y) pow_int(x,y)
+#define m_pow_int(x,y) pow_int(x,y)
+
+/* bitwise operations */
+#define m_bit_and(x,y) ((x)&(y))
+#define m_bit_or(x,y)  ((x)|(y))
+#define m_bit_xor(x,y) ((x)^(y))
+#define m_bit_not(x)   (~(x))
+
+/* unary operations */
+#define m_minus(x)   (-(x))
+#define m_reciprocal(x) int_reciprocal(x)
+#define m_square(x)  ((x)*(x))
+
+/* comparisons and shifts */
+#define m_eq(x,y) ((x)==(y))
+#define m_ne(x,y) ((x)!=(y))
+#define m_gt(x,y) ((x)>(y))
+#define m_ge(x,y) ((x)>=(y))
+#define m_lt(x,y) ((x)<(y))
+#define m_le(x,y) ((x)<=(y))
+#define m_left_shift(x,y) ((x)<<(y))
+#define m_right_shift(x,y) ((x)>>(y))
+
+/* integers are never NaN */
+#define m_isnan(x) 0
+
+/* accumulation helpers: each updates its accumulator argument in place */
+#define m_mulsum(x,y,z) {z += x*y;}
+#define m_mulsum_init INT2FIX(0)
+#define m_cumsum(x,y) {x += y;}
+#define m_cumprod(x,y) {x *= y;}
+
+/* three-way and greater-than comparison used by sorting; operands are
+ * passed through qsort_cast, which is defined elsewhere */
+#define cmp(a,b)                                        \
+    ((qsort_cast(a)==qsort_cast(b)) ? 0 :               \
+     (qsort_cast(a) > qsort_cast(b)) ? 1 : -1)
+#define cmpgt(a,b)                              \
+    (qsort_cast(a) > qsort_cast(b))
+
+
+/* Sum of n elements starting at p, advancing by stride bytes; 0 for n == 0. */
+static inline dtype f_sum(size_t n, char *p, ssize_t stride)
+{
+    dtype acc = 0;
+    size_t k;
+
+    for (k = 0; k < n; k++) {
+        acc += *(dtype*)p;
+        p += stride;
+    }
+    return acc;
+}
+
+/* Product of n elements starting at p, advancing by stride bytes; 1 for
+ * n == 0. */
+static inline dtype f_prod(size_t n, char *p, ssize_t stride)
+{
+    dtype acc = 1;
+    size_t k;
+
+    for (k = 0; k < n; k++) {
+        acc *= *(dtype*)p;
+        p += stride;
+    }
+    return acc;
+}
+
+/* Smallest of n strided elements.  The first element is read
+ * unconditionally and seeds the result. */
+static inline dtype f_min(size_t n, char *p, ssize_t stride)
+{
+    dtype cur, best;
+    size_t rest = n;
+
+    best = *(dtype*)p;
+    p += stride;
+    rest--;                     /* first element already consumed */
+    while (rest--) {
+        cur = *(dtype*)p;
+        if (cur < best) {
+            best = cur;
+        }
+        p += stride;
+    }
+    return best;
+}
+
+/* Largest of n strided elements.  The first element is read
+ * unconditionally and seeds the result. */
+static inline dtype f_max(size_t n, char *p, ssize_t stride)
+{
+    dtype cur, best;
+    size_t rest = n;
+
+    best = *(dtype*)p;
+    p += stride;
+    rest--;                     /* first element already consumed */
+    while (rest--) {
+        cur = *(dtype*)p;
+        if (cur > best) {
+            best = cur;
+        }
+        p += stride;
+    }
+    return best;
+}
+
+/* Index of the smallest of n strided elements; the first occurrence wins
+ * on ties.  The first element is read unconditionally. */
+static inline size_t f_min_index(size_t n, char *p, ssize_t stride)
+{
+    dtype cur, best;
+    size_t k, pos = 0;
+    char *q = p;
+
+    best = *(dtype*)q;
+    for (k = 1; k < n; k++) {
+        q += stride;
+        cur = *(dtype*)q;
+        if (cur < best) {
+            best = cur;
+            pos = k;
+        }
+    }
+    return pos;
+}
+
+/* Index of the largest of n strided elements; the first occurrence wins
+ * on ties.  The first element is read unconditionally. */
+static inline size_t f_max_index(size_t n, char *p, ssize_t stride)
+{
+    dtype cur, best;
+    size_t k, pos = 0;
+    char *q = p;
+
+    best = *(dtype*)q;
+    for (k = 1; k < n; k++) {
+        q += stride;
+        cur = *(dtype*)q;
+        if (cur > best) {
+            best = cur;
+            pos = k;
+        }
+    }
+    return pos;
+}
+
+/* Stores the smallest and largest of n strided elements into *amin and
+ * *amax.  The first element is read unconditionally and seeds both. */
+static inline void
+f_minmax(size_t n, char *p, ssize_t stride, dtype* amin, dtype* amax)
+{
+    dtype cur, lo, hi;
+    size_t rest = n;
+
+    lo = hi = *(dtype*)p;
+    p += stride;
+    rest--;                     /* first element already consumed */
+    while (rest--) {
+        cur = *(dtype*)p;
+        if (m_gt(cur,hi)) {
+            hi = cur;
+        }
+        if (m_lt(cur,lo)) {
+            lo = cur;
+        }
+        p += stride;
+    }
+    *amin = lo;
+    *amax = hi;
+}
+
+/* Peak-to-peak value (max minus min) of n strided elements. */
+static inline dtype f_ptp(size_t n, char *p, ssize_t stride)
+{
+    dtype lo, hi;
+
+    f_minmax(n, p, stride, &lo, &hi);
+    return m_sub(hi,lo);
+}
+
+/* Returns x + y*c, the c-th term of the arithmetic sequence that starts at
+ * x and advances by y. */
+static inline double f_seq(double x, double y, double c)
+{
+    double offset = y * c;
+
+    return x + offset;
+}
diff --git a/ext/numo/narray/rand.c b/ext/numo/narray/rand.c
new file mode 100644
index 0000000..97542cb
--- /dev/null
+++ b/ext/numo/narray/rand.c
@@ -0,0 +1,72 @@
+#include "ruby.h"
+#include "numo/narray.h"
+#include "SFMT.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <time.h>
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+/*
+ * Returns the number of significant bits of a, i.e. the 1-based position
+ * of its highest set bit; 0 when a is 0.
+ *
+ * The position is located by a binary search over 1..63 that starts at 32
+ * and halves its step on every round.  A value with bit 63 set is answered
+ * up front because the search cannot reach position 64.
+ */
+int n_bits(u_int64_t a)
+{
+    int i, x, xl, n=5;
+    u_int64_t m;
+
+    if (a==0) return 0;
+    if (a>>63) return 64;
+
+    x  = 1<<n;
+    xl = 0;
+
+    for (i=n; i>=0; i--) {
+        /* mask selecting bit (x-1) and everything above it; the shift is
+           done in 64 bits because x-1 can be as large as 62 */
+        m = ~((((u_int64_t)1)<<(x-1))-1);
+        if (m & a) {
+            xl = x;
+            /* no further half-step on the last round: 1<<(i-1) would be a
+               shift by -1 */
+            if (i>0) x += 1<<(i-1);
+        } else {
+            if (i>0) x -= 1<<(i-1);
+        }
+    }
+    return xl;
+}
+
+/* Builds a seed by XOR-ing the current time (seconds and microseconds),
+ * the process id and a counter that is incremented on every call. */
+static u_int64_t
+ random_seed()
+{
+    static int n = 0;
+    struct timeval now;
+
+    gettimeofday(&now, 0);
+    return now.tv_sec ^ now.tv_usec ^ getpid() ^ n++;
+}
+
+/* srand([seed]): seeds the generator with the given integer, or with
+ * random_seed() when no argument is passed.  Returns nil. */
+static VALUE
+ nary_s_srand(int argc, VALUE *argv, VALUE obj)
+{
+    VALUE vseed;
+    u_int64_t seed;
+    int given;
+
+    //rb_secure(4);
+    given = rb_scan_args(argc, argv, "01", &vseed);
+    if (given != 0) {
+        seed = NUM2UINT64(vseed);
+    }
+    else {
+        seed = random_seed();
+    }
+    init_gen_rand(seed);
+
+    return Qnil;
+}
+
+/* Seeds the generator with 0 and registers the srand singleton method on
+ * cNArray. */
+void
+Init_nary_rand() {
+    init_gen_rand(0);
+    rb_define_singleton_method(cNArray, "srand", nary_s_srand, -1);
+}
diff --git a/ext/numo/narray/step.c b/ext/numo/narray/step.c
new file mode 100644
index 0000000..a047765
--- /dev/null
+++ b/ext/numo/narray/step.c
@@ -0,0 +1,501 @@
+/*
+  step.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 2007,2013 by Masahiro TANAKA
+*/
+#include <ruby.h>
+#include <math.h>
+
+#include "numo/narray.h"
+
+#if defined(__FreeBSD__) && __FreeBSD__ < 4
+#include <floatingpoint.h>
+#endif
+
+#ifdef HAVE_FLOAT_H
+#include <float.h>
+#endif
+
+#ifdef HAVE_IEEEFP_H
+#include <ieeefp.h>
+#endif
+
+#ifndef DBL_EPSILON
+#define DBL_EPSILON 2.2204460492503131e-16
+#endif
+
+/* method / instance-variable ids, filled in by Init_nary_step */
+static ID id_beg, id_end, id_len, id_step, id_excl;
+
+/* EXCL(r): asks r through its exclude_end? method whether the end value is
+ * excluded */
+//#define EXCL(r) RTEST(rb_ivar_get((r), id_excl))
+#define EXCL(r) RTEST(rb_funcall((r), rb_intern("exclude_end?"), 0))
+
+/* SET_EXCL(r,v): stores the C truth value v as true/false in the id_excl
+ * instance variable of r */
+#define SET_EXCL(r,v) rb_ivar_set((r), id_excl, (v) ? Qtrue : Qfalse)
+
+/* Validates len (when given it must be a non-negative Integer) and stores
+ * the five parameters as instance variables of self. */
+static void
+step_init(
+  VALUE self,
+  VALUE beg,
+  VALUE end,
+  VALUE step,
+  VALUE len,
+  VALUE excl
+)
+{
+    if (RTEST(len)) {
+        int is_integer = FIXNUM_P(len) || TYPE(len)==T_BIGNUM;
+
+        if (!is_integer) {
+            rb_raise(rb_eArgError, "length must be Integer");
+        }
+        if (RTEST(rb_funcall(len,rb_intern("<"),1,INT2FIX(0)))) {
+            rb_raise(rb_eRangeError,"length must be non negative");
+        }
+    }
+
+    rb_ivar_set(self, id_beg, beg);
+    rb_ivar_set(self, id_end, end);
+    rb_ivar_set(self, id_len, len);
+    rb_ivar_set(self, id_step, step);
+    SET_EXCL(self, excl);
+}
+
+/* Allocates a Step object and initialises it from the given parameters. */
+VALUE
+nary_step_new(
+  VALUE beg,
+  VALUE end,
+  VALUE step,
+  VALUE len,
+  VALUE excl
+)
+{
+    VALUE obj;
+
+    obj = rb_obj_alloc(na_cStep);
+    step_init(obj, beg, end, step, len, excl);
+    return obj;
+}
+
+/* Allocates a Step object whose begin, end and exclude_end? values are
+ * queried from range; step and len are taken from the arguments. */
+VALUE
+nary_step_new2(
+  VALUE range,
+  VALUE step,
+  VALUE len
+)
+{
+    VALUE obj = rb_obj_alloc(na_cStep);
+    VALUE first, last, excl_end;
+
+    first    = rb_funcall(range, id_beg, 0);
+    last     = rb_funcall(range, id_end, 0);
+    excl_end = rb_funcall(range, rb_intern("exclude_end?"), 0);
+
+    step_init(obj, first, last, step, len, excl_end);
+    return obj;
+}
+
+
+/*
+ *  call-seq:
+ *     Step.new(start, end, step=nil, length=nil)    => step
+ *     Step.new(range, step=nil, length=nil)         => step
+ *
+ *  Constructs a step using three parameters among <i>start</i>,
+ *  <i>end</i>, <i>step</i> and <i>length</i>.  <i>start</i>,
+ *  <i>end</i> parameters can be replaced with <i>range</i>.  If the
+ *  <i>step</i> is omitted (or supplied with nil), then calculated
+ *  from <i>length</i> or defined as 1.
+ */
+
+/* Implements Step#initialize.  Accepts either (start, end, step, length)
+ * or (range, step, length); in the range form begin, end and exclude_end?
+ * are queried from the range.  Raises when called on an object that was
+ * already initialised. */
+static VALUE
+step_initialize( int argc, VALUE *argv, VALUE self )
+{
+    VALUE arg0, arg1=Qnil, arg2=Qnil, arg3=Qnil;
+    VALUE beg, end, step, len, excl;
+
+    rb_scan_args(argc, argv, "13", &arg0, &arg1, &arg2, &arg3);
+    /* Step objects are immutable, so they may be initialized only once. */
+    if (rb_ivar_defined(self, id_beg)) {
+        rb_name_error(rb_intern("initialize"), "`initialize' called twice");
+    }
+
+    if (rb_obj_is_kind_of(arg0,rb_cRange)) {
+        if (argc>3) {
+            rb_raise(rb_eArgError, "extra argument");
+        }
+        excl = rb_funcall(arg0, rb_intern("exclude_end?"), 0);
+        end  = rb_funcall(arg0, id_end, 0);
+        beg  = rb_funcall(arg0, id_beg, 0);
+        step = arg1;
+        len  = arg2;
+    } else {
+        beg  = arg0;
+        end  = arg1;
+        step = arg2;
+        len  = arg3;
+        excl = Qnil;
+    }
+
+    step_init(self, beg, end, step, len, excl);
+    return Qnil;
+}
+
+/*
+ *  call-seq:
+ *     step.begin  => obj
+ *     step.first  => obj
+ *
+ *  Returns the start of <i>step</i>.
+ */
+
+static VALUE
+step_first( VALUE self )
+{
+    VALUE beg = rb_ivar_get(self, id_beg);
+
+    return beg;
+}
+
+/*
+ *  call-seq:
+ *     step.end    => obj
+ *     step.last   => obj
+ *
+ *  Returns the object that defines the end of <i>step</i>.
+ */
+
+static VALUE
+step_last( VALUE self )
+{
+    VALUE end = rb_ivar_get(self, id_end);
+
+    return end;
+}
+
+/*
+ *  call-seq:
+ *     step.length  => obj
+ *     step.size    => obj
+ *
+ *  Returns the length of <i>step</i>.
+ */
+
+static VALUE
+step_length( VALUE self )
+{
+    VALUE len = rb_ivar_get(self, id_len);
+
+    return len;
+}
+
+/*
+ *  call-seq:
+ *     step.step    => obj
+ *
+ *  Returns the step of <i>step</i>.
+ */
+
+static VALUE
+step_step( VALUE self )
+{
+    VALUE step = rb_ivar_get(self, id_step);
+
+    return step;
+}
+
+/*
+ *  call-seq:
+ *     step.exclude_end?    => true or false
+ *
+ *  Returns <code>true</code> if <i>step</i> excludes its end value.
+ */
+static VALUE
+step_exclude_end_p(VALUE self)
+{
+    /* normalise whatever is stored to a strict true/false */
+    if (RTEST(rb_ivar_get(self, id_excl))) {
+        return Qtrue;
+    }
+    return Qfalse;
+}
+
+
+/*
+ *  call-seq:
+ *     step.parameters([array_size])    => [start,step,length]
+ *
+ *  Returns the iteration parameters of <i>step</i>.  If
+ *  <i>array_size</i> is given, negative array index is considered.
+ */
+
+/*
+ * Resolves a Step (or Range-like) object into array-index iteration
+ * parameters for an array of ary_size elements.
+ *
+ *   self      object answering begin/end/exclude_end? and carrying the
+ *             length and step instance variables
+ *   ary_size  size of the indexed array; negative begin/end values are
+ *             offset by it
+ *   plen, pbeg, pstep
+ *             receive the element count, first index and index increment;
+ *             each may be NULL
+ *
+ * Raises StandardError when begin, end, step and length are all given or
+ * when the step is zero, and RangeError when the resolved begin or end
+ * lies outside the array.
+ */
+void
+nary_step_array_index(VALUE self, size_t ary_size,
+                      size_t *plen, ssize_t *pbeg, ssize_t *pstep)
+{
+    size_t len;
+    ssize_t beg=0, step=1;
+    VALUE vbeg, vend, vstep, vlen;
+    ssize_t end=ary_size;
+
+    /* length and step are read as instance variables, begin and end through
+       method calls (a plain Range answers those too) */
+    vlen = rb_ivar_get(self, id_len);
+    vstep = rb_ivar_get(self, id_step);
+    vbeg = rb_funcall(self, id_beg, 0);
+    vend = rb_funcall(self, id_end, 0);
+
+    if (RTEST(vbeg)) {
+        beg = NUM2SSIZET(vbeg);
+        if (beg<0) {
+            beg += ary_size;
+        }
+    }
+    if (RTEST(vend)) {
+        end = NUM2SSIZET(vend);
+        if (end<0) {
+            end += ary_size;
+        }
+    }
+
+    if (RTEST(vlen)) {
+        len = NUM2SIZET(vlen);
+        if (len>0) {
+            if (RTEST(vstep)) {
+                /* convert the Ruby value vstep (the C variable step is not
+                   a VALUE and must not be passed to the converter) */
+                step = NUM2SSIZET(vstep);
+                if (RTEST(vbeg)) {
+                    if (RTEST(vend)) {
+                        rb_raise( rb_eStandardError, "verbose Step object" );
+                    } else {
+                        end = beg + step*(len-1);
+                    }
+                } else {
+                    if (RTEST(vend)) {
+                        if (EXCL(self)) {
+                            if (step>0) end--;
+                            if (step<0) end++;
+                        }
+                        beg = end - step*(len-1);
+                    } else {
+                        beg = 0;
+                        end = step*(len-1);
+                    }
+                }
+            } else { // no step
+                step = 1;
+                if (RTEST(vbeg)) {
+                    if (RTEST(vend)) {
+                        if (EXCL(self)) {
+                            if (beg<end) end--;
+                            if (beg>end) end++;
+                        }
+                        if (len>1)
+                            step = (end-beg)/(len-1);
+                    } else {
+                        end = beg + (len-1);
+                    }
+                } else {
+                    if (RTEST(vend)) {
+                        if (EXCL(self)) {
+                            end--;
+                        }
+                        beg = end - (len-1);
+                    } else {
+                        beg = 0;
+                        end = len-1;
+                    }
+                }
+            }
+        }
+    } else { // no len
+        if (RTEST(vstep)) {
+            step = NUM2SSIZET(vstep);
+        } else {
+            step = 1;
+        }
+        if (step>0) {
+            if (!RTEST(vbeg)) {
+                beg = 0;
+            }
+            if (!RTEST(vend)) {
+                end = ary_size-1;
+            }
+            else if (EXCL(self)) {
+                end--;
+            }
+            if (beg<=end) {
+                len = (end-beg)/step+1;
+            } else {
+                len = 0;
+            }
+        } else if (step<0) {
+            if (!RTEST(vbeg)) {
+                beg = ary_size-1;
+            }
+            if (!RTEST(vend)) {
+                end = 0;
+            }
+            else if (EXCL(self)) {
+                end++;
+            }
+            if (beg>=end) {
+                len = (beg-end)/(-step)+1;
+            } else {
+                len = 0;
+            }
+        } else {
+            rb_raise( rb_eStandardError, "step must be non-zero" );
+        }
+    }
+
+    if (beg<0 || beg>=(ssize_t)ary_size ||
+        end<0 || end>=(ssize_t)ary_size) {
+        rb_raise( rb_eRangeError,
+                  "beg=%"SZF"d,end=%"SZF"d is out of array size (%"SZF"u)",
+                  beg, end, ary_size );
+    }
+    if (plen) *plen = len;
+    if (pbeg) *pbeg = beg;
+    if (pstep) *pstep = step;
+}
+
+
+/*
+ * Resolves a Step (or Range-like) object into the parameters of a
+ * floating-point sequence.
+ *
+ *   self   object answering begin/end/exclude_end? and carrying the length
+ *          and step instance variables
+ *   plen, pbeg, pstep
+ *          receive the element count, first value and increment; each may
+ *          be NULL
+ *
+ * With a length: an explicit step is used as given; otherwise the step is
+ * derived from begin, end and length, or defaults to 1 when there is no
+ * end.  Without a length: the count is derived from begin, end and step
+ * (default 1) with a small tolerance for rounding error.
+ *
+ * Raises ArgumentError when neither length nor end is available or when
+ * the computed size is not finite.
+ */
+void
+nary_step_sequence( VALUE self, size_t *plen, double *pbeg, double *pstep )
+{
+    VALUE vbeg, vend, vstep, vlen;
+    double dbeg, dend, dstep=1, dsize, err;
+    size_t size, n;
+
+    vbeg = rb_funcall(self, id_beg, 0);
+    dbeg = NUM2DBL(vbeg);
+
+    vend = rb_funcall(self, id_end, 0);
+
+    vlen = rb_ivar_get(self, id_len);
+    vstep = rb_ivar_get(self, id_step);
+
+    if (RTEST(vlen)) {
+        size = NUM2SIZET(vlen);
+
+        if (RTEST(vstep)) {
+            /* an explicit step was previously ignored in this branch,
+               leaving the increment at its default of 1 */
+            dstep = NUM2DBL(vstep);
+        } else if (RTEST(vend)) {
+            dend = NUM2DBL(vend);
+            /* number of intervals between begin and end */
+            if (EXCL(self)) {
+                n = size;
+            } else {
+                n = size-1;
+            }
+            if (n>0) {
+                dstep = (dend-dbeg)/n;
+            } else {
+                dstep = 1;
+            }
+        } else {
+            dstep = 1;
+        }
+    } else {
+        if (!RTEST(vstep)) {
+            dstep = 1;
+        } else {
+            dstep = NUM2DBL(vstep);
+        }
+        if (RTEST(vend)) {
+            dend = NUM2DBL(vend);
+            /* tolerance (capped at half a step) that decides whether an end
+               value hit up to rounding error counts as reached */
+            err = (fabs(dbeg)+fabs(dend)+fabs(dend-dbeg))/fabs(dstep)*DBL_EPSILON;
+            if (err>0.5) err=0.5;
+            dsize = (dend-dbeg)/dstep;
+            if (EXCL(self))
+                dsize -= err;
+            else
+                dsize += err;
+            dsize = floor(dsize) + 1;
+            if (dsize<0) dsize=0;
+            if (isinf(dsize) || isnan(dsize)) {
+                rb_raise(rb_eArgError, "not finite size");
+            }
+            size = dsize;
+        } else {
+            rb_raise(rb_eArgError, "cannot determine length argument");
+        }
+    }
+
+    if (plen) *plen = size;
+    if (pbeg) *pbeg = dbeg;
+    if (pstep) *pstep = dstep;
+}
+
+/*
+static VALUE
+step_each( VALUE self )
+{
+    VALUE  a;
+    double beg, step;
+    size_t i, size;
+
+    a = nary_step_parameters( self, Qnil );
+    beg  = NUM2DBL(RARRAY_PTR(a)[0]);
+    step = NUM2DBL(RARRAY_PTR(a)[1]);
+    size = NUM2SIZET(RARRAY_PTR(a)[2]);
+
+    for (i=0; i<size; i++) {
+        rb_yield(rb_float_new(beg+i*step));
+    }
+    return self;
+}
+*/
+
+/* Range#%: builds a Step from the range with the given step and no length. */
+static VALUE
+range_with_step( VALUE range, VALUE step )
+{
+    VALUE result = nary_step_new2( range, step, Qnil );
+
+    return result;
+}
+
+/* Range#*: builds a Step from the range with the given length and no step. */
+static VALUE
+range_with_length( VALUE range, VALUE len )
+{
+    VALUE result = nary_step_new2( range, Qnil, len );
+
+    return result;
+}
+
+
+/* NArray.step(...): allocates a Step and initialises it with the same
+ * arguments Step#initialize accepts. */
+static VALUE
+nary_s_step( int argc, VALUE *argv, VALUE mod )
+{
+    VALUE obj;
+
+    obj = rb_obj_alloc(na_cStep);
+    step_initialize(argc, argv, obj);
+    return obj;
+}
+
+
+/* Returns Qtrue when arg is a Range or a Step, Qfalse otherwise. */
+VALUE
+nary_is_sequence( VALUE arg )
+{
+    if ( rb_obj_is_kind_of(arg, rb_cRange) ||
+         rb_obj_is_kind_of(arg, na_cStep) ) {
+        return Qtrue;
+    }
+    return Qfalse;
+}
+
+
+
+/* Defines the Step class under cNArray with its accessor methods, adds the
+ * % and * operators to Range, registers NArray.step and initialises the
+ * ids used by this file. */
+void
+Init_nary_step()
+{
+    /* ids used for method calls and instance variables in this file */
+    id_beg  = rb_intern("begin");
+    id_end  = rb_intern("end");
+    id_len  = rb_intern("length");
+    id_step = rb_intern("step");
+    id_excl = rb_intern("excl");
+
+    na_cStep = rb_define_class_under(cNArray, "Step", rb_cObject);
+    rb_include_module(na_cStep, rb_mEnumerable);
+    rb_define_method(na_cStep, "initialize", step_initialize, -1);
+
+    //rb_define_method(na_cStep, "each", step_each, 0);
+
+    /* accessors; begin/end and length/size are aliases of each other */
+    rb_define_method(na_cStep, "first", step_first, 0);
+    rb_define_method(na_cStep, "last", step_last, 0);
+    rb_define_method(na_cStep, "begin", step_first, 0);
+    rb_define_method(na_cStep, "end", step_last, 0);
+    rb_define_method(na_cStep, "step", step_step, 0);
+    rb_define_method(na_cStep, "length", step_length, 0);
+    rb_define_method(na_cStep, "size", step_length, 0);
+    rb_define_method(na_cStep, "exclude_end?", step_exclude_end_p, 0);
+    //rb_define_method(na_cStep, "to_s", step_to_s, 0);
+    //rb_define_method(na_cStep, "inspect", step_inspect, 0);
+    //rb_define_method(na_cStep, "parameters", nary_step_parameters, 1);
+
+    /* Range operators producing Step objects */
+    rb_define_method(rb_cRange, "%", range_with_step, 1);
+    rb_define_method(rb_cRange, "*", range_with_length, 1);
+
+    rb_define_singleton_method(cNArray, "step", nary_s_step, -1);
+}
diff --git a/ext/numo/narray/struct.c b/ext/numo/narray/struct.c
new file mode 100644
index 0000000..6cc5f6e
--- /dev/null
+++ b/ext/numo/narray/struct.c
@@ -0,0 +1,885 @@
+/*
+  struct.c
+  Numerical Array Extension for Ruby
+    (C) Copyright 1999-2017 by Masahiro TANAKA
+*/
+#include <ruby.h>
+#include "numo/narray.h"
+#include "numo/template.h"
+
+#define cT numo_cStruct
+VALUE cT;
+
+/*
+  Instance method "allocate" of the classes created by nst_s_new.
+  Makes sure the data buffer backing `self` exists:
+   - data array : xmalloc (element byte size * number of elements) bytes
+                  when the array is non-empty and has no buffer yet;
+   - view       : forwards "allocate" to the object the view refers to;
+   - file map   : not implemented; falls into the default branch.
+  Returns self.
+*/
+static VALUE
+nst_allocate(VALUE self)
+{
+    narray_t *na;
+    char *ptr;
+    VALUE velmsz;
+
+    GetNArray(self,na);
+
+    switch(NA_TYPE(na)) {
+    case NARRAY_DATA_T:
+        ptr = NA_DATA_PTR(na);
+        if (na->size > 0 && ptr == NULL) {
+            // Use the same constant name under which nst_s_new defines the
+            // element size (ELEMENT_BYTE_SIZE); a lower-case name is not a
+            // constant and cannot be found by rb_const_get.
+            velmsz = rb_const_get(CLASS_OF(self), rb_intern(ELEMENT_BYTE_SIZE));
+            ptr = xmalloc(NUM2SIZET(velmsz) * na->size);
+            NA_DATA_PTR(na) = ptr;
+        }
+        break;
+    case NARRAY_VIEW_T:
+        rb_funcall(NA_VIEW_DATA(na), rb_intern("allocate"), 0);
+        break;
+    case NARRAY_FILEMAP_T:
+        //ptr = ((narray_filemap_t*)na)->ptr;
+        // to be implemented
+        // (intentional fall-through to the error below)
+    default:
+        rb_bug("invalid narray type : %d",NA_TYPE(na));
+    }
+    return self;
+}
+
+
+/*
+  Returns the DEFINITIONS constant (the frozen array of field definitions
+  built by nst_s_new) for a struct class.
+
+  `nst` may be the struct class itself (internal callers pass
+  CLASS_OF(obj)) or a struct instance: this function is also registered
+  as the instance method "definitions", in which case the receiver is an
+  instance and must be mapped to its class before the constant lookup.
+*/
+static inline VALUE
+nst_definitions(VALUE nst)
+{
+    if (TYPE(nst) != T_CLASS) {
+        nst = CLASS_OF(nst);
+    }
+    return rb_const_get(nst, rb_intern("DEFINITIONS"));
+}
+
+/*
+  Look up one field definition of the struct instance `nst`.
+
+  idx : String/Symbol -> definition whose name (element 0) matches;
+        Numeric       -> definition at that position; negative values
+                         count from the end, like Array#[].
+  Returns the definition array, or nil when a name is not found or idx
+  has an unsupported type.  Raises IndexError when a numeric idx is
+  outside -len...len.
+*/
+static VALUE
+nst_definition(VALUE nst, VALUE idx)
+{
+    long i;
+    VALUE def = nst_definitions(CLASS_OF(nst));
+    long  len = RARRAY_LEN(def);
+
+    if (TYPE(idx) == T_STRING || TYPE(idx) == T_SYMBOL) {
+        ID id  = rb_to_id(idx);
+        for (i=0; i<len; i++) {
+            VALUE key = RARRAY_AREF(RARRAY_AREF(def,i),0);
+            if (SYM2ID(key) == id) {
+                return RARRAY_AREF(def,i);
+            }
+        }
+    } else if (rb_obj_is_kind_of(idx,rb_cNumeric)) {
+        i = NUM2LONG(idx);
+        if (i<-len || i>=len) {
+            rb_raise(rb_eIndexError,"offset %ld out of range of struct(size:%ld)", i, len);
+        }
+        // RARRAY_AREF does no index normalization: map -len..-1 to 0..len-1
+        // here, otherwise a negative idx reads before the array storage.
+        if (i < 0) {
+            i += len;
+        }
+        return RARRAY_AREF(def,i);
+    }
+    return Qnil;
+}
+
+
+
+void na_copy_array_structure(VALUE self, VALUE view);
+
+/*
+  Create a view onto the data of `self` for one struct member.
+
+  self   : the struct array (data, file-map or view type).
+  dtype  : either an NArray instance describing the member (its class and
+           shape are used), or a class inheriting from NArray, or anything
+           else (then the class of `self` is used for the view).
+  offset : added to the view's offset when truthy.
+
+  The resulting view has the dimensions of `self` followed by the
+  dimensions of the member (when dtype is an NArray instance).
+  Returns the new view object.
+*/
+VALUE
+na_make_view_struct(VALUE self, VALUE dtype, VALUE offset)
+{
+    size_t i, n;
+    int j, k, ndim;
+    size_t *shape;
+    size_t *idx1, *idx2;
+    ssize_t stride;
+    stridx_t *stridx;
+    narray_t *na, *nt;
+    narray_view_t *na1, *na2;
+    VALUE klass;
+    volatile VALUE view;
+
+    GetNArray(self,na);
+
+    // build from Numo::Struct
+    if (rb_obj_is_kind_of(dtype,cNArray)) {
+	GetNArray(dtype,nt);
+        ndim = na->ndim + nt->ndim;
+        shape = ALLOCA_N(size_t,ndim);
+        // struct dimensions
+        for (j=0; j<na->ndim; j++) {
+            shape[j] = na->shape[j];
+        }
+        // member dimension
+        for (j=na->ndim,k=0; j<ndim; j++,k++) {
+            shape[j] = nt->shape[k];
+        }
+        klass = CLASS_OF(dtype);
+        stridx = ALLOC_N(stridx_t, ndim);
+        // strides of the trailing (member) dimensions, innermost first,
+        // starting from the element size of the member's class
+        stride = na_dtype_elmsz(klass);
+        for (j=ndim,k=nt->ndim; k; ) {
+            SDX_SET_STRIDE(stridx[--j],stride);
+            stride *= nt->shape[--k];
+        }
+    } else {
+        // no member dimensions: the view has the same shape as self
+        ndim = na->ndim;
+        shape = ALLOCA_N(size_t,ndim);
+        for (j=0; j<ndim; j++) {
+            shape[j] = na->shape[j];
+        }
+        klass = CLASS_OF(self);
+        if (TYPE(dtype)==T_CLASS) {
+            if (RTEST(rb_class_inherited_p(dtype,cNArray))) {
+                klass = dtype;
+            }
+        }
+        stridx = ALLOC_N(stridx_t, ndim);
+    }
+
+    view = na_s_allocate_view(klass);
+    na_copy_flags(self, view);
+    GetNArrayView(view, na2);
+    na_setup_shape((narray_t*)na2, ndim, shape);
+    na2->stridx = stridx;
+
+    // fill in the strides/indices of the leading (struct) dimensions
+    switch(na->type) {
+    case NARRAY_DATA_T:
+    case NARRAY_FILEMAP_T:
+        // strides accumulated from the innermost struct dimension outwards
+        stride = nary_element_stride(self);
+        for (j=na->ndim; j--;) {
+            SDX_SET_STRIDE(na2->stridx[j], stride);
+            stride *= na->shape[j];
+        }
+        na2->offset = 0;
+        na2->data = self;
+        break;
+    case NARRAY_VIEW_T:
+        // self is itself a view: copy its per-dimension stride or index
+        // array (index arrays are duplicated, not shared) and refer to the
+        // same underlying data object
+        GetNArrayView(self, na1);
+        for (j=na1->base.ndim; j--; ) {
+            if (SDX_IS_INDEX(na1->stridx[j])) {
+                n = na1->base.shape[j];
+                idx1 = SDX_GET_INDEX(na1->stridx[j]);
+                idx2 = ALLOC_N(size_t, na1->base.shape[j]);
+                for (i=0; i<n; i++) {
+                    idx2[i] = idx1[i];
+                }
+                SDX_SET_INDEX(na2->stridx[j],idx2);
+            } else {
+                na2->stridx[j] = na1->stridx[j];
+            }
+        }
+        na2->offset = na1->offset;
+        na2->data = na1->data;
+        break;
+    }
+
+    // shift to the member's position inside each struct element
+    if (RTEST(offset)) {
+        na2->offset += NUM2SIZET(offset);
+    }
+
+    return view;
+}
+
+
+/*
+  Returns a view onto the member of `self` selected by `idx`
+  (a name or a position, as accepted by nst_definition).
+  Raises TypeError when no definition is found.
+*/
+VALUE
+nst_field_view(VALUE self, VALUE idx)
+{
+    VALUE entry = nst_definition(self, idx);
+
+    if (!RTEST(entry)) {
+        idx = rb_funcall(idx, rb_intern("to_s"), 0);
+        rb_raise(rb_eTypeError, "Invalid field: '%s' for struct %s",
+                 StringValuePtr(idx), rb_class2name(CLASS_OF(self)));
+    }
+    // entry layout: [name, type, offset, ...]
+    return na_make_view_struct(self, RARRAY_AREF(entry,1), RARRAY_AREF(entry,2));
+}
+
+/*
+  Returns the member of `self` selected by `idx`: the member view itself,
+  or the result of calling "extract" on it when the view is 0-dimensional.
+*/
+VALUE
+nst_field(VALUE self, VALUE idx)
+{
+    VALUE view = nst_field_view(self,idx);
+    narray_view_t *nv;
+
+    GetNArrayView(view,nv);
+    if (nv->base.ndim != 0) {
+        return view;
+    }
+    return rb_funcall(view,rb_intern("extract"),0);
+}
+
+/*
+  Stores `other` into the member of `self` selected by `idx` by calling
+  "store" on the member view.  Returns `other`.
+*/
+VALUE
+nst_field_set(VALUE self, VALUE idx, VALUE other)
+{
+    VALUE view = nst_field_view(self,idx);
+
+    rb_funcall(view,rb_intern("store"),1,other);
+    return other;
+}
+
+
+/*
+  method_missing hook providing member accessors on struct arrays:
+
+    obj.name          -> nst_field(obj, :name)
+    obj.name = value  -> nst_field_set(obj, :name, value), returns value
+
+  A selector that does not name a defined member is passed on to the
+  superclass implementation (normally ending in NoMethodError).  The
+  definition is checked first because nst_field/nst_field_view raise
+  TypeError for unknown names, which would otherwise hide the regular
+  "undefined method" error for every mistyped method call.
+*/
+static VALUE
+nst_method_missing(int argc, VALUE *argv, VALUE self)
+{
+    VALUE s, tag;
+    long  len;
+
+    if (argc == 2) {
+        s = rb_sym_to_s(argv[0]);
+        len = RSTRING_LEN(s);
+        // len > 1: needs at least one character before the trailing '='
+        if (len > 1 && RSTRING_PTR(s)[len-1] == '=') {
+            tag = rb_str_intern(rb_str_new(RSTRING_PTR(s), len-1));
+            if (RTEST(nst_definition(self, tag))) {
+                // store through the member view (not the extracted value)
+                return nst_field_set(self, tag, argv[1]);
+            }
+        }
+    } else if (argc == 1) {
+        if (RTEST(nst_definition(self, argv[0]))) {
+            return nst_field(self, argv[0]);
+        }
+    }
+    return rb_call_super(argc,argv);
+}
+
+
+/*
+  Numo::Struct.new : create a new struct class (a subclass of the receiver).
+
+  Foo = Numo::Struct.new {
+    int8     :byte
+    float64  :float, [2,2]
+    dcomplex :compl
+  }
+
+  If the first argument is a String it is used as the name of a constant
+  defined under the receiver; otherwise an anonymous class is created.
+  The block, when given, is evaluated in the context of the new class so
+  that the member-declaring singleton methods (int8, float64, ...) can
+  record the members.  Afterwards the accumulated byte size and the
+  member list are published as constants of the new class.
+  Returns the new class.
+ */
+static VALUE
+nst_s_new(int argc, VALUE *argv, VALUE klass)
+{
+    VALUE name=Qnil, rest, size;
+    VALUE st, members;
+    ID id;
+
+    rb_scan_args(argc, argv, "0*", &rest);
+    if (RARRAY_LEN(rest)>0) {
+        name = RARRAY_AREF(rest,0);
+        if (!NIL_P(name)) {
+            // only a String-like first argument is taken as the class name
+            VALUE tmp = rb_check_string_type(name);
+            if (!NIL_P(tmp)) {
+                rb_ary_shift(rest);
+            } else {
+                name = Qnil;
+            }
+        }
+    }
+
+    if (NIL_P(name)) {
+        // anonymous class
+        // NOTE(review): `name` is Qnil here and is passed where an ID is
+        // expected -- confirm rb_define_class_id ignores its first argument.
+        st = rb_define_class_id(name, klass);
+        rb_funcall(klass, rb_intern("inherited"), 1, st);
+    }
+    else {
+        char *cname = StringValuePtr(name);
+        id = rb_intern(cname);
+        if (!rb_is_const_id(id)) {
+            rb_name_error(id, "identifier %s needs to be constant", cname);
+        }
+        if (rb_const_defined_at(klass, id)) {
+            rb_warn("redefining constant Struct::%s", cname);
+            rb_mod_remove_const(klass, ID2SYM(id));
+        }
+        st = rb_define_class_under(klass, rb_id2name(id), klass);
+    }
+
+    // accumulators updated by nstruct_add_type while the block runs
+    rb_iv_set(st, "__members__", rb_ary_new());
+    rb_iv_set(st, "__offset__", INT2FIX(0));
+
+    if (rb_block_given_p()) {
+        rb_mod_module_eval(0, 0, st);
+    }
+
+    // the final offset is the total byte size of one struct element
+    size = rb_iv_get(st, "__offset__");
+    members = rb_iv_get(st, "__members__");
+    //printf("size=%d\n",NUM2INT(size));
+    rb_define_const(st, CONTIGUOUS_STRIDE, size);
+    rb_define_const(st, ELEMENT_BYTE_SIZE, size);
+    rb_define_const(st, ELEMENT_BIT_SIZE,  rb_funcall(size,'*',1,INT2FIX(8)));
+
+    OBJ_FREEZE(members);
+    rb_define_const(st, "DEFINITIONS", members);
+
+    // the generated class gets the ordinary "new" back (the receiver's
+    // "new" is this function) and its own "allocate"
+    rb_define_singleton_method(st, "new", rb_class_new_instance, -1);
+    //rb_define_singleton_method(st, "[]", rb_class_new_instance, -1);
+    rb_define_method(st, "allocate", nst_allocate, 0);
+
+    return st;
+}
+
+
+/*
+  Append one member definition to the struct class `nst`.
+
+  type : an NArray subclass, or an NArray instance (then its class and
+         shape are used and any shape given in argv is overridden).
+  argv : at most one String/Symbol (the member name) and at most one
+         Array (the member shape); other argument types are ignored.
+
+  Pushes [name, type-view, offset, byte-size] onto nst's "__members__"
+  and advances "__offset__" by the member's byte size.
+  Raises ArgumentError for duplicate name/shape arguments, an empty shape
+  or more than NA_MAX_DIMENSION dimensions.  Returns nil.
+*/
+static VALUE
+nstruct_add_type(VALUE type, int argc, VALUE *argv, VALUE nst)
+{
+    VALUE ofs, size;
+    ID id;
+    int i;
+    VALUE name=Qnil;
+    size_t *shape=NULL;
+    int ndim=0;
+    ssize_t stride;
+    narray_view_t *nt;
+    int j;
+
+    for (i=0; i<argc; i++) {
+        switch(TYPE(argv[i])) {
+        case T_STRING:
+        case T_SYMBOL:
+            if (NIL_P(name)) {
+                name = argv[i];
+                break;
+            }
+            // a second name was given (rb_raise does not return)
+            rb_raise(rb_eArgError,"multiple name in struct definition");
+        case T_ARRAY:
+            if (shape) {
+                rb_raise(rb_eArgError,"multiple shape in struct definition");
+            }
+            ndim = RARRAY_LEN(argv[i]);
+            if (ndim > NA_MAX_DIMENSION) {
+                rb_raise(rb_eArgError,"too large number of dimensions");
+            }
+            if (ndim == 0) {
+                rb_raise(rb_eArgError,"array is empty");
+            }
+            shape = ALLOCA_N(size_t, ndim);
+            na_array_to_internal_shape(Qnil, argv[i], shape);
+            break;
+        }
+    }
+
+    // normalize the name to a Symbol (rb_to_id is applied even when no
+    // name argument was found, i.e. when name is still nil)
+    id = rb_to_id(name);
+    name = ID2SYM(id);
+    if (rb_obj_is_kind_of(type,cNArray)) {
+        narray_t *na;
+        GetNArray(type,na);
+        type = CLASS_OF(type);
+        ndim = na->ndim;
+        shape = na->shape;
+    }
+    type = nary_view_new(type,ndim,shape);
+    GetNArrayView(type,nt);
+
+    // strides of the member's own dimensions, innermost first
+    nt->stridx = ALLOC_N(stridx_t,ndim);
+    stride = na_dtype_elmsz(CLASS_OF(type));
+    for (j=ndim; j--; ) {
+        SDX_SET_STRIDE(nt->stridx[j], stride);
+        stride *= shape[j];
+    }
+
+    // the member starts at the current end of the struct
+    ofs  = rb_iv_get(nst, "__offset__");
+    nt->offset = NUM2SIZET(ofs);
+
+    size = rb_funcall(type, rb_intern("byte_size"), 0);
+    rb_iv_set(nst, "__offset__", rb_funcall(ofs,'+',1,size));
+    rb_ary_push(rb_iv_get(nst,"__members__"),
+                rb_ary_new3(4,name,type,ofs,size));  // <- field definition
+    return Qnil;
+}
+
+
+/*
+  Instance method "extract" of Numo::Struct: returns the receiver unchanged.
+*/
+static VALUE
+nst_extract(VALUE self)
+{
+    return self;
+}
+
+
+/*
+  Loop body used by nary_struct_to_a.
+  lp->option is the pair [member views, definitions] produced by
+  nst_create_member_views.  For the struct element at the current
+  position each member view is re-pointed (offset = position + member
+  offset) and converted with "extract" (0-dimensional member) or "to_a";
+  the resulting per-element Array is pushed onto lp->args[1].value.
+*/
+static void
+iter_nstruct_to_a(na_loop_t *const lp)
+{
+    VALUE   views = RARRAY_AREF(lp->option,0);
+    VALUE   defs  = RARRAY_AREF(lp->option,1);
+    size_t  pos   = lp->args[0].iter[0].pos;
+    long    n     = RARRAY_LEN(views);
+    VALUE   row   = rb_ary_new2(n);
+    long    k;
+
+    for (k=0; k<n; k++) {
+        VALUE   def  = RARRAY_AREF(defs,k);
+        size_t  ofs  = NUM2SIZET(RARRAY_AREF(def,2));
+        VALUE   view = RARRAY_AREF(views,k);
+        narray_view_t *nv;
+        VALUE   val;
+
+        GetNArrayView(view,nv);
+        nv->offset = pos + ofs;
+        if (nv->base.ndim != 0) {
+            val = rb_funcall(view,rb_intern("to_a"),0);
+        } else {
+            val = rb_funcall(view,rb_intern("extract"),0);
+        }
+        rb_ary_push(row, val);
+    }
+    rb_ary_push(lp->args[1].value, row);
+}
+
+/*
+  Returns the object holding the data of `self`: the `data` member of
+  the view when `self` is a view, otherwise `self` itself.
+*/
+static VALUE
+na_original_data(VALUE self)
+{
+    narray_t *na;
+    narray_view_t *nv;
+
+    GetNArray(self,na);
+    if (na->type != NARRAY_VIEW_T) {
+        return self;
+    }
+    GetNArrayView(self, nv);
+    return nv->data;
+}
+
+/*
+  Builds one fresh view per member definition of self's class, each
+  referring to the data object behind `self` (see na_original_data).
+  Returns the two-element array [views, definitions].
+*/
+static VALUE
+nst_create_member_views(VALUE self)
+{
+    VALUE defs  = nst_definitions(CLASS_OF(self));
+    long  n     = RARRAY_LEN(defs);
+    VALUE views = rb_ary_new2(n);
+    long  k;
+
+    for (k=0; k<n; k++) {
+        narray_view_t *nv;
+        // element 1 of a definition is the member's template view
+        VALUE tmpl = RARRAY_AREF(RARRAY_AREF(defs,k),1);
+        VALUE view = na_make_view(tmpl);
+
+        rb_ary_push(views, view);
+        GetNArrayView(view,nv);
+        nv->data = na_original_data(self);
+    }
+    return rb_assoc_new(views,defs);
+}
+
+/*
+  Instance method "to_a" of Numo::Struct.
+  Runs iter_nstruct_to_a over `self` through
+  na_ndloop_cast_narray_to_rarray, passing the member views created by
+  nst_create_member_views as the loop option, and returns its result.
+*/
+static VALUE
+nary_struct_to_a(VALUE self)
+{
+    // volatile: keeps the option array referenced from the C stack while
+    // the loop runs
+    volatile VALUE opt;
+    ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
+    ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
+    ndfunc_t ndf = {iter_nstruct_to_a, NO_LOOP, 3, 1, ain, aout};
+
+    opt = nst_create_member_views(self);
+    return na_ndloop_cast_narray_to_rarray(&ndf, self, opt);
+}
+
+
+
+// ---
+/*
+  Returns 1 when `item` is a Ruby Array or a 1-dimensional NArray,
+  0 otherwise.
+*/
+static size_t
+check_array(VALUE item) {
+    narray_t *na;
+
+    if (TYPE(item) == T_ARRAY) {
+        return 1;
+    }
+    if (!RTEST(rb_obj_is_kind_of(item, cNArray))) {
+        return 0;
+    }
+    GetNArray(item,na);
+    return (na->ndim == 1) ? 1 : 0;
+}
+
+/*
+  Returns 1 when `item` is
+   - a Ruby Array of exactly `size` entries, each of which satisfies
+     check_array(), or
+   - a 1-dimensional NArray with `size` elements;
+  0 otherwise.
+*/
+static size_t
+check_array_1d(VALUE item, size_t size) {
+    narray_t *na;
+    size_t k, n;
+
+    if (TYPE(item) == T_ARRAY) {
+        n = RARRAY_LEN(item);
+        if (n != size) {
+            return 0;
+        }
+        for (k=0; k<n; k++) {
+            if (!check_array(RARRAY_AREF(item,k))) {
+                return 0;
+            }
+        }
+        return 1;
+    }
+    if (!RTEST(rb_obj_is_kind_of(item, cNArray))) {
+        return 0;
+    }
+    GetNArray(item,na);
+    return (na->ndim == 1 && na->size == size) ? 1 : 0;
+}
+
+/*
+  Check whether `ary` is structurally compatible with one element of the
+  struct class `nst`.
+
+  - A non-Array is compatible only when its class is `nst` itself.
+  - An Array must have one entry per member definition, and every entry
+    must fit its member:
+      0-dim member : the entry must not be array-like (check_array);
+      1-dim member : the entry must pass check_array_1d with the
+                     member's size;
+      N-dim member : the shape derived from the entry by na_s_new_like
+                     must equal the member's shape.
+  Returns Qtrue when all members match, Qfalse otherwise.
+*/
+VALUE
+nst_check_compatibility(VALUE nst, VALUE ary)
+{
+    VALUE defs, def, type, item;
+    long len, i;
+    narray_t *nt;
+
+    if (TYPE(ary) != T_ARRAY) {
+        if (nst==CLASS_OF(ary)) { // same Struct
+            return Qtrue;
+        }
+        return Qfalse;
+    }
+    defs = nst_definitions(nst);
+    len = RARRAY_LEN(defs);
+
+    if (len != RARRAY_LEN(ary)) {
+        return Qfalse;
+    }
+    for (i=0; i<len; i++) {
+        def  = RARRAY_AREF(defs,i);
+        type = RARRAY_AREF(def,1);
+        GetNArray(type,nt);
+        item = RARRAY_AREF(ary,i);
+        if (nt->ndim == 0) {
+            if (check_array(item)) {
+                return Qfalse;
+            }
+        } else if (nt->ndim == 1) {
+            if (!check_array_1d(item, nt->size)) {
+                return Qfalse;
+            }
+        } else {
+            // multi-dimension member
+            VALUE vnc;
+            narray_t *nc;
+            int j;
+
+            vnc = na_s_new_like(cNArray, item);
+            GetNArray(vnc,nc);
+            if (nt->ndim != nc->ndim) {
+                return Qfalse;
+            }
+            for (j=0; j<nc->ndim; j++) {
+                if (nc->shape[j] != nt->shape[j]) {
+                    return Qfalse;
+                }
+            }
+            // This member matches; keep going so that the members after
+            // it are validated as well (returning Qtrue here would accept
+            // any data for them).
+        }
+    }
+    return Qtrue;
+}
+
+
+
+VALUE na_ary_composition_for_struct(VALUE nstruct, VALUE ary);
+
+// ------
+/*
+  Loop body used by nary_struct_cast_array.
+  lp->option is the pair [member views, definitions]; lp->args[1].value
+  is the Ruby Array holding the values of one struct element.  Each
+  member view is re-pointed at the current element (loop position +
+  member offset) and the matching array entry is written with "store".
+*/
+static void
+iter_nstruct_from_a(na_loop_t *const lp)
+{
+    VALUE views = RARRAY_AREF(lp->option,0);
+    VALUE defs  = RARRAY_AREF(lp->option,1);
+    long  n     = RARRAY_LEN(views);
+    VALUE row   = lp->args[1].value;
+    long  k;
+
+    for (k=0; k<n; k++) {
+        VALUE  def  = RARRAY_AREF(defs,k);
+        size_t ofs  = NUM2SIZET(RARRAY_AREF(def,2));
+        VALUE  view = RARRAY_AREF(views,k);
+        narray_view_t *nv;
+
+        GetNArrayView(view,nv);
+        nv->offset = lp->args[0].iter[0].pos + ofs;
+        rb_funcall(view, rb_intern("store"), 1, RARRAY_AREF(row,k));
+    }
+}
+
+/*
+  Builds a new struct array of class `klass` from the Ruby Array `rary`.
+  The array is created with na_s_new_like; when it is non-empty its
+  buffer is allocated and filled element by element through
+  iter_nstruct_from_a.  Returns the new array.
+*/
+static VALUE
+nary_struct_cast_array(VALUE klass, VALUE rary)
+{
+    VALUE result, views;
+    narray_t *na;
+    ndfunc_arg_in_t ain[3] = {{OVERWRITE,0},{rb_cArray,0},{sym_option}};
+    ndfunc_t ndf = {iter_nstruct_from_a, NO_LOOP, 3, 0, ain, 0};
+
+    result = na_s_new_like(klass, rary);
+    GetNArray(result,na);
+    if (na->size == 0) {
+        // nothing to allocate or copy
+        return result;
+    }
+    views = nst_create_member_views(result);
+    rb_funcall(result, rb_intern("allocate"), 0);
+    na_ndloop_store_rarray2(&ndf, result, rary, views);
+    return result;
+}
+
+/*
+  Singleton methods "cast" and "[]" of Numo::Struct (see Init_nary_struct):
+  converts the Ruby Array `rary` into a struct array of class `klass` by
+  delegating to nary_struct_cast_array.
+*/
+static inline VALUE
+nary_struct_s_cast(VALUE klass, VALUE rary)
+{
+    return nary_struct_cast_array(klass, rary);
+}
+
+
+
+/*
+  Loop body used by nary_struct_store_struct: copies struct elements of
+  `elmsz` bytes each from argument 1 (source) to argument 0
+  (destination) with memcpy.
+  Each side is addressed either through an index array (idx1/idx2
+  non-NULL: element at base pointer + *idx) or by stepping the pointer
+  (p1/p2 advanced by s1/s2 per element); the four branches below cover
+  the four combinations.
+*/
+static void
+iter_struct_store_struct(na_loop_t *const lp)
+{
+    size_t  i, s1, s2;
+    char   *p1, *p2;
+    size_t *idx1, *idx2;
+    size_t  elmsz;
+    char   *x, *y;
+
+    INIT_COUNTER(lp, i);
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);
+    INIT_ELMSIZE(lp, 0, elmsz);
+    if (idx2) {
+        if (idx1) {
+            // destination: index, source: index
+            for (; i--;) {
+                x = (char*)(p1+*idx1); idx1++;
+                y = (char*)(p2+*idx2); idx2++;
+                memcpy(x,y,elmsz);
+            }
+        } else {
+            // destination: step, source: index
+            for (; i--;) {
+                x = (char*)p1;         p1+=s1;
+                y = (char*)(p2+*idx2); idx2++;
+                memcpy(x,y,elmsz);
+            }
+        }
+    } else {
+        if (idx1) {
+            // destination: index, source: step
+            for (; i--;) {
+                x = (char*)(p1+*idx1); idx1++;
+                y = (char*)p2;         p2+=s2;
+                memcpy(x,y,elmsz);
+            }
+        } else {
+            // destination: step, source: step
+            for (; i--;) {
+                x = (char*)p1;         p1+=s1;
+                y = (char*)p2;         p2+=s2;
+                memcpy(x,y,elmsz);
+            }
+        }
+    }
+}
+
+
+/*
+  Copies the elements of the struct array `obj` into `self` by running
+  iter_struct_store_struct through na_ndloop.  Returns self.
+*/
+static VALUE
+nary_struct_store_struct(VALUE self, VALUE obj)
+{
+    // argument 0 (self) is declared OVERWRITE: it is the copy destination
+    ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{Qnil,0}};
+    ndfunc_t ndf = {iter_struct_store_struct, FULL_LOOP, 2, 0, ain, 0};
+
+    na_ndloop(&ndf, 2, self, obj);
+    return self;
+}
+
+
+
+
+/*
+  Stores the Ruby Array `obj` into `self`: the array is first converted
+  to a struct array of self's class (nary_struct_cast_array) and then
+  copied with nary_struct_store_struct.  Returns self.
+*/
+static inline VALUE
+nary_struct_store_array(VALUE self, VALUE obj)
+{
+    return nary_struct_store_struct(self, nary_struct_cast_array(CLASS_OF(self),obj));
+}
+
+/*
+  Store elements to Numo::Struct from other.
+  Accepts a Ruby Array or an object of exactly the same class as self;
+  anything else raises a cast error.
+  @overload store(other)
+  @param [Object] other
+  @return [Numo::Struct] self
+*/
+static VALUE
+nary_struct_store(VALUE self, VALUE obj)
+{
+    if (TYPE(obj)==T_ARRAY) {
+        nary_struct_store_array(self,obj);
+    } else if (CLASS_OF(self) == CLASS_OF(obj)) {
+        nary_struct_store_struct(self,obj);
+    } else {
+        rb_raise(nary_eCastError, "unknown conversion from %s to %s",
+                 rb_class2name(CLASS_OF(obj)),
+                 rb_class2name(CLASS_OF(self)));
+    }
+    return self;
+}
+
+
+
+/*
+  Per-element formatter passed to na_ndloop_inspect by nary_struct_inspect.
+  `opt` is the pair [member views, definitions]; `pos` is combined with
+  each member's offset to point the member view at the current element.
+  `ptr` is not used here.
+  Returns a String of the form "[name: value, name: value, ...]".
+*/
+static VALUE
+//iter_struct_inspect(na_loop_t *const lp)
+iter_struct_inspect(char *ptr, size_t pos, VALUE opt)
+{
+    VALUE   types, defs, def, name, elmt, vary, v, x;
+    size_t  ofs;
+    long    i, len;
+    narray_view_t *ne;
+
+    types = RARRAY_AREF(opt,0);
+    defs = RARRAY_AREF(opt,1);
+
+    len = RARRAY_LEN(types);
+    vary = rb_ary_new2(len);
+
+    for (i=0; i<len; i++) {
+        def  = RARRAY_AREF(defs,i);
+        name = RARRAY_AREF(def,0);
+        ofs  = NUM2SIZET(RARRAY_AREF(def,2));
+        elmt = RARRAY_AREF(types,i);
+        GetNArrayView(elmt,ne);
+        ne->offset = pos + ofs;
+        // "name: " followed by the formatted member value
+        v = rb_str_concat(rb_sym_to_s(name), rb_str_new2(": "));
+        x = rb_funcall(elmt, rb_intern("format_to_a"), 0);        // <-- fix me
+        if (ne->base.ndim==0) {
+            // 0-dimensional member: take the first entry of format_to_a's
+            // result instead of printing the enclosing array
+            x = rb_funcall(x, rb_intern("first"), 0);
+        }
+        x = rb_funcall(x, rb_intern("to_s"), 0);
+        v = rb_str_concat(v, x);
+        rb_ary_push(vary, v);
+    }
+    v = rb_ary_join(vary, rb_str_new2(", "));
+    v = rb_str_concat(rb_str_new2("["), v);
+    v = rb_str_concat(v, rb_str_new2("]"));
+    return v;
+}
+
+/*
+  Returns a string containing a human-readable representation of NArray.
+  The per-element text is produced by iter_struct_inspect, which receives
+  the member views created by nst_create_member_views.
+  @overload inspect
+  @return [String]
+*/
+VALUE
+nary_struct_inspect(VALUE ary)
+{
+    VALUE opt;
+    opt = nst_create_member_views(ary);
+    return na_ndloop_inspect(ary, iter_struct_inspect, opt);
+}
+
+
+/*
+  Singleton method "add_type": declares a struct member whose type is
+  given explicitly as the first argument; the remaining arguments
+  (name, shape) are handed to nstruct_add_type.
+  Raises ArgumentError when called without arguments.  Returns nil.
+*/
+static VALUE
+nst_s_add_type(int argc, VALUE *argv, VALUE mod)
+{
+    if (argc==0)
+        rb_raise(rb_eArgError,
+                 "wrong number of arguments (%d for 1)", argc);
+    nstruct_add_type(argv[0],argc-1,argv+1,mod);
+    return Qnil;
+}
+
+
+
+
+/*
+  NST_TYPEDEF(tpname,tpclass) generates the singleton method
+  nst_s_<tpname>, which declares a struct member of element class
+  `tpclass` by forwarding its arguments to nstruct_add_type.
+  The generated functions are registered with arity -1, so they must use
+  the (int argc, VALUE *argv, VALUE self) signature.
+*/
+#define NST_TYPEDEF(tpname,tpclass)                 \
+static VALUE                                        \
+nst_s_##tpname(int argc, VALUE *argv, VALUE mod)    \
+{   nstruct_add_type(tpclass,argc,argv,mod);        \
+    return Qnil;                                    \
+}
+
+NST_TYPEDEF(int8,numo_cInt8)
+NST_TYPEDEF(int16,numo_cInt16)
+NST_TYPEDEF(int32,numo_cInt32)
+NST_TYPEDEF(int64,numo_cInt64)
+NST_TYPEDEF(uint8,numo_cUInt8)
+NST_TYPEDEF(uint16,numo_cUInt16)
+NST_TYPEDEF(uint32,numo_cUInt32)
+NST_TYPEDEF(uint64,numo_cUInt64)
+NST_TYPEDEF(dfloat,numo_cDFloat)
+NST_TYPEDEF(dcomplex,numo_cDComplex)
+NST_TYPEDEF(sfloat,numo_cSFloat)
+NST_TYPEDEF(scomplex,numo_cSComplex)
+
+
+/* Defines `name1` as an alias of the singleton (class-level) method
+   `name2` of `klass` by aliasing on klass's singleton class. */
+#define rb_define_singleton_alias(klass,name1,name2) \
+    rb_define_alias(rb_singleton_class(klass),name1,name2)
+
+/*
+  Extension initializer for Numo::Struct.
+  Defines the class under mNumo as a subclass of numo_cNArray and
+  registers the class-level member declaration methods together with
+  the instance methods implemented in this file.
+*/
+void
+Init_nary_struct()
+{
+    cT = rb_define_class_under(mNumo, "Struct", numo_cNArray);
+    //cNStMember = rb_define_class_under(cT, "Member", rb_cObject);
+
+    //rb_define_alloc_func(cNStMember, nst_member_s_allocate);
+    //rb_define_method(cNStMember, "initialize", nst_member_initialize, -1);
+
+    //rb_undef_alloc_func(cT);
+    // Struct.new builds a new struct class (see nst_s_new)
+    rb_define_singleton_method(cT, "new", nst_s_new, -1);
+    // member declaration methods, meant to be called inside the block
+    // given to Struct.new
+    rb_define_singleton_method(cT, "add_type", nst_s_add_type, -1);
+    rb_define_singleton_method(cT, "int8",   nst_s_int8,   -1);
+    rb_define_singleton_method(cT, "int16",  nst_s_int16,  -1);
+    rb_define_singleton_method(cT, "int32",  nst_s_int32,  -1);
+    rb_define_singleton_method(cT, "int64",  nst_s_int64,  -1);
+    rb_define_singleton_method(cT, "uint8",  nst_s_uint8,  -1);
+    rb_define_singleton_method(cT, "uint16", nst_s_uint16, -1);
+    rb_define_singleton_method(cT, "uint32", nst_s_uint32, -1);
+    rb_define_singleton_method(cT, "uint64", nst_s_uint64, -1);
+    rb_define_singleton_method(cT, "sfloat",   nst_s_sfloat, -1);
+    rb_define_singleton_alias (cT, "float32", "sfloat");
+    rb_define_singleton_method(cT, "scomplex", nst_s_scomplex, -1);
+    rb_define_singleton_alias (cT, "complex64", "scomplex");
+    rb_define_singleton_method(cT, "dfloat",   nst_s_dfloat, -1);
+    rb_define_singleton_alias (cT, "float64", "dfloat");
+    rb_define_singleton_method(cT, "dcomplex", nst_s_dcomplex, -1);
+    rb_define_singleton_alias (cT, "complex128", "dcomplex");
+
+    // NOTE(review): "definitions" is bound as an instance method, so
+    // nst_definitions receives an instance here while the internal
+    // callers in this file pass a class -- confirm it handles both.
+    rb_define_method(cT, "definition", nst_definition, 1);
+    rb_define_method(cT, "definitions", nst_definitions, 0);
+    rb_define_method(cT, "field", nst_field, 1);
+    rb_define_method(cT, "field_set", nst_field_set, 2);
+    rb_define_method(cT, "extract", nst_extract, 0);
+    rb_define_method(cT, "method_missing", nst_method_missing, -1);
+
+    //rb_define_method(cT, "fill", nary_nstruct_fill, 1);
+
+    //rb_define_method(cT, "debug_print", nary_nstruct_debug_print, 0);
+
+    rb_define_method(cT, "to_a", nary_struct_to_a, 0);
+
+    rb_define_method(cT, "store", nary_struct_store, 1);
+
+    rb_define_method(cT, "inspect", nary_struct_inspect, 0);
+
+    // "cast" takes one argument; "[]" (arity -2) receives all of its
+    // arguments packed into a single Ruby Array
+    rb_define_singleton_method(cT, "cast", nary_struct_s_cast, 1);
+    rb_define_singleton_method(cT, "[]", nary_struct_s_cast, -2);
+
+    //rb_define_method(cT, "initialize", rb_struct_initialize, -2);
+    //rb_define_method(cT, "initialize_copy", rb_struct_init_copy, 1);
+}
diff --git a/lib/erbpp.rb b/lib/erbpp.rb
new file mode 100644
index 0000000..a454b27
--- /dev/null
+++ b/lib/erbpp.rb
@@ -0,0 +1,294 @@
+require "erb"
+
+class ErbPP
+  ATTRS = []
+
+  class ParamNotSetError < StandardError; end
+
+  def self.define_attrs(attrs)
+    attrs.each do |attr|
+      ivar = ("@"+attr).to_sym
+      define_method(attr){|*a| attr_method(ivar,*a)}
+    end
+  end
+
+  def attr_method(ivar,arg=nil)
+    if arg.nil?
+      instance_variable_get(ivar)
+    else
+      instance_variable_set(ivar,arg)
+    end
+  end
+
+  def initialize(parent,erb_path,opts={})
+    parents.push(parent) if parent
+    @erb_path = erb_path
+    @tmpl = @erb_path
+
+    @opts = opts
+    if @opts.class != Hash
+      raise ArgumentError, "option is not Hash"
+    end
+
+    @opts.each do |k,v|
+      ivar = ("@"+k.to_s).to_sym
+      instance_variable_set(ivar,v)
+    end
+  end
+
+  def load_erb
+    safe_level = nil
+    trim_mode = '%<>'
+    @erb = ERB.new(File.read(@erb_path),safe_level,trim_mode)
+    @erb.filename = @erb_path
+  end
+
+  def parents
+    @parents ||= []
+  end
+
+  def search_method_in_parents(_meth_id)
+    parents.each do |x|
+      if x.has_attr? _meth_id
+        return x
+      end
+    end
+    parents.each do |x|
+      if f = x.search_method_in_parents(_meth_id)
+        return f
+      end
+    end
+    nil
+  end
+
+  def attrs
+    self.class::ATTRS
+  end
+
+  def has_attr?(_meth_id)
+    respond_to?(_meth_id) or attrs.include?(_meth_id.to_s)
+  end
+
+  def check_params(*params)
+    params.each do |x|
+      val = send(x)
+      if !val # || val.empty?
+        raise ParamNotSetError,"parameter #{x.to_s} is not set"
+      end
+    end
+  end
+
+  alias method_missing_alias method_missing
+
+  def method_missing(_meth_id, *args, &block)
+    ivar = "@"+_meth_id.to_s
+    if args.empty? and instance_variable_defined?(ivar)
+      parm = instance_variable_get(ivar)
+      if parm.nil?
+        raise ParamNotSetError,"parameter #{_meth_id.to_s} is not set"
+      end
+      parm
+    elsif args.size == 1 and attrs.include?(_meth_id.to_s)
+      instance_variable_set(ivar,args.first)
+    elsif x = search_method_in_parents(_meth_id)
+      x.send(_meth_id, *args, &block)
+    else
+      method_missing_alias(_meth_id, *args)
+    end
+  end
+
+  def run
+    load_erb unless @erb
+    @erb.run(binding)
+  end
+
+  def result
+    load_erb unless @erb
+    @erb.result(binding)
+  end
+end
+
+# ----------------------------------------------------------------------
+
+class IdVar
+  DEFS = []
+
+  def id_decl
+    "static ID #{@id_var};"
+  end
+
+  def id_assign
+    "#{@id_var} = rb_intern(\"#{@name}\");"
+  end
+
+  def initialize(parent,name,var=nil)
+    @name = name
+    var = name if var.nil?
+    @id_var = "id_"+var.gsub(/\?/,"_p").gsub(/\!/,"_bang")
+    DEFS.push(self)
+  end
+
+  def self.declaration
+    DEFS.map do |x|
+      x.id_decl
+    end
+  end
+
+  def self.assignment
+    DEFS.map do |x|
+      x.id_assign
+    end
+  end
+end
+
+# ----------------------------------------------------------------------
+
+class Function < ErbPP
+  DEFS = []
+
+  attrs = %w[
+    singleton
+    meth
+    n_arg
+  ]
+  define_attrs attrs
+
+  def id_op
+    if op.size == 1
+      "'#{op}'"
+    else
+      "id_#{method}"
+    end
+  end
+
+  def method
+    meth.gsub(/\?/,"_p").gsub(/\!/,"_bang")
+  end
+
+  def initialize(parent,tmpl,**opts)
+    super
+    @aliases = opts[:aliases] || []
+    parent.tmpl_dirs.each do |d|
+      @erb_path = File.join(d, tmpl+".c")
+      break if File.exist?(@erb_path)
+    end
+    DEFS.push(self)
+  end
+
+  def c_method
+    "#{m_prefix}#{method}"
+  end
+
+  def c_iter
+    begin
+      t = "_"+type_name
+    rescue
+      t = ""
+    end
+    "iter#{t}_#{method}"
+  end
+  alias c_iterator c_iter
+
+  def c_func
+    s = singleton ? "_s" : ""
+    begin
+      t = "_"+type_name
+    rescue
+      t = ""
+    end
+    "numo#{t}#{s}_#{method}"
+  end
+  alias c_function c_func
+  alias c_instance_method c_func
+
+  def op_map
+    @op || meth
+  end
+
+  def code
+    result + "\n\n"
+  end
+
+  def definition
+    return nil if n_arg <= -9
+    s = singleton ? "_singleton" : ""
+    check_params(:mod_var, :op_map, :c_func, :n_arg)
+    m = op_map
+    a = ["rb_define#{s}_method(#{mod_var}, \"#{m}\", #{c_func}, #{n_arg});"]
+    @aliases.map{|x| a << "rb_define_alias(#{mod_var}, \"#{x}\", \"#{m}\");"}
+    a
+  end
+
+  def self.codes
+    a = []
+    DEFS.each do |i|
+      x = i.code
+      a.push(x) if x
+    end
+    a
+  end
+
+  def self.definitions
+    a = []
+    DEFS.each do |i|
+      case x = i.definition
+      when Array
+        a.concat(x)
+      when String
+        a.push(x)
+      else
+        raise "unknown definition: #{x}" if x
+      end
+    end
+    a
+  end
+end
+
+class ModuleFunction < Function
+  def definition
+    m = op_map
+    "rb_define_module_function(#{mod_var}, \"#{m}\", #{c_func}, #{n_arg});"
+  end
+end
+
+# A Function whose code is generated but which emits no registration
+# statement (#definition is always nil).
+class NodefFunction < Function
+  def definition
+    nil
+  end
+end
+
+class Alias < ErbPP
+  def initialize(parent, dst, src)
+    super(parent,nil)
+    @dst = dst
+    @src = src
+    Function::DEFS.push(self)
+  end
+
+  def code
+    nil
+  end
+
+  def definition
+    "rb_define_alias(#{mod_var}, \"#{dst}\", \"#{src}\");"
+  end
+end
+
+class Const < ErbPP
+  def initialize(parent,name,value,desc)
+    super(parent,nil)
+    @name = name
+    @value = value
+    @desc = desc
+    Function::DEFS.push(self)
+  end
+
+  def code
+    nil
+  end
+
+  def definition
+    "/*"+desc+"*/\n    "+
+      "rb_define_const(#{mod_var},\"#{name}\",#{value});"
+  end
+end
diff --git a/lib/erbpp/line_number.rb b/lib/erbpp/line_number.rb
new file mode 100644
index 0000000..cd51a60
--- /dev/null
+++ b/lib/erbpp/line_number.rb
@@ -0,0 +1,133 @@
+# Output buffer used in place of ERB's plain String (see the ERB patch
+# in this file).  While template output is appended it keeps track of the
+# template line the output comes from and inserts C preprocessor
+# "#line" directives when the output line numbering would otherwise
+# drift from the template line numbering.
+class CountLnString < String
+
+  # filename :: template file name reported in the "#line" directives.
+  # The buffer starts with a newline and a directive for line 1.
+  def initialize(filename)
+    @filename = filename
+    @lnchar = "\n"
+    @buf = ""      # all output collected for the current template line
+    @str = ""      # the part of @buf that was added through concat0
+    @countln = 1   # line number expected after the previous flush
+    @current = 1   # template line the pending output belongs to
+    super("\n"+report_line)
+  end
+
+  # "#line" directive for the current template line.
+  def report_line
+    "#line #{@current} \"#{@filename}\"\n"
+  end
+
+  # Appends +s+; the text is recorded in both @buf and @str.  The
+  # caller's line number (taken from the backtrace) drives #ln.
+  def concat0(s)
+    ln(caller[0])
+    @buf.concat(s)
+    @str.concat(s)
+  end
+
+  # Appends +s+; the text is recorded in @buf only.
+  def concat1(s)
+    ln(caller[0])
+    @buf.concat(s)
+  end
+
+  # Notes that output now comes from another template line.
+  # +status+ is either a backtrace entry ("file:LINE:...") or something
+  # responding to to_i.  When the line changes, the pending buffer is
+  # flushed, preceded by a "#line" directive if needed.
+  def ln(status=nil)
+    case status
+    when /:(\d+):/
+      n = $1.to_i
+    else
+      n = status.to_i
+    end
+    return if n == @current
+    if @current != @countln || @postpone
+      if /\A\s*\z/ =~ @str || /\A#line / =~ @buf
+        # only whitespace pending (or a directive already present):
+        # defer the directive to a later flush
+        @postpone = true
+      elsif @in_comment
+        # no directive is emitted while inside a C comment
+        @postpone = false
+      else
+        if self[-1] != "\n"
+          concat("\n")
+        end
+        concat(report_line)
+        @postpone = false
+      end
+    end
+    concat(@buf)
+
+    # Track whether the flushed text leaves a /* ... */ comment open.
+    # String literals are blanked first so that comment markers inside
+    # them are not counted.
+    b = @buf.gsub(/".*?(?<!\\)"/,'""')
+    /^.*(\/\*)(.*?)$/ =~ b
+    x = $2
+    /^.*(\*\/)(.*?)$/ =~ b
+    y = $2
+    if x
+      if y
+        # both markers present: the one with less text after it comes last
+        if x.size < y.size
+          #:in_comment
+          @in_comment = true
+        else
+          #:out_comment
+          @in_comment = false
+        end
+      else
+        #:in_comment
+        @in_comment = true
+      end
+    else
+      if y
+        #:out_comment
+        @in_comment = false
+      else
+        #:keep
+      end
+    end
+
+    @countln = @current + @buf.count(@lnchar)
+    @current = n
+    @buf = ""
+    @str = ""
+  end
+
+  # NOTE(review): looks like a debugging leftover -- x, y and r are not
+  # defined in this method, so calling it would presumably fail; confirm
+  # that it is unused.
+  def d(s)
+    p [s, [x,y], r]
+    r
+  end
+
+  # Flushes the output that is still pending in @buf.
+  def final
+    concat(@buf)
+  end
+
+end
+
+# Patch of the standard ERB class: the compiled template source is
+# rewritten so that output is collected in a CountLnString, which adds
+# C "#line" directives to the generated text.
+# NOTE(review): this relies on the exact text of the Ruby source that ERB
+# generates ("_erbout.concat ...", "_erbout = ''" ...) -- confirm against
+# the ERB version in use.
+class ERB
+  alias result_orig result
+
+  # Like the original ERB#result, but evaluates the rewritten source
+  # (see #src_with_cpp_line).
+  def result(b=new_toplevel)
+    src = src_with_cpp_line
+    if @safe_level
+      proc {
+        $SAFE = @safe_level
+        eval(src, b, (@filename || '(erb)'), 0)
+      }.call
+    else
+      #open("tmpout","w"){|f| f.write src} if /dtype/=~@filename
+      eval(src, b, (@filename || '(erb)'), 0)
+    end
+  end
+
+  alias src_orig src
+
+  # The rewritten source replaces the original reader.
+  def src
+    src_with_cpp_line
+  end
+
+  # Returns the compiled template source with
+  # * string-literal concat calls renamed to concat0 and parenthesized
+  #   concat calls renamed to concat1,
+  # * the output buffer created as CountLnString.new(<file name>) on the
+  #   second source line,
+  # * a call to _erbout.ln on the force_encoding line, and
+  # * a trailing call to _erbout.final.
+  # The lines of @src are modified in place (gsub!/sub!).
+  def src_with_cpp_line
+    @src.each_line.with_index.map do |line, num|
+      line.gsub!(/_erbout.concat "/,'_erbout.concat0 "')
+      line.gsub!(/_erbout.concat\(/,'_erbout.concat1(')
+      if num==0
+        # skip
+      elsif num==1
+        # @filename must be set here: nil does not respond to #dump
+        f = @filename.dump
+        line.sub!(/_erbout = (''|String\.new);/, "_erbout = CountLnString.new(#{f});")
+      elsif /^; _erbout\.force_encoding/ =~ line
+        line.sub!(/^;/,";_erbout.ln(#{num});")
+      end
+      line
+    end.join+";_erbout.final;"
+  end
+
+end
diff --git a/lib/erbpp/narray_def.rb b/lib/erbpp/narray_def.rb
new file mode 100644
index 0000000..06881d3
--- /dev/null
+++ b/lib/erbpp/narray_def.rb
@@ -0,0 +1,381 @@
+require 'erbpp'
+
+module DefMethod
+
+  def def_id(meth,var=nil)
+    IdVar.new(self, meth, var)
+  end
+
+  def def_method(meth, n_arg, tmpl=nil, opts={})
+    h = {:meth => meth, :n_arg => n_arg}
+    h.merge!(opts)
+    tmpl ||= meth
+    Function.new(self, tmpl, h)
+  end
+
+  def def_singleton(meth, n_arg, tmpl=nil, opts={})
+    def_method(meth, n_arg, tmpl, :singleton => true)
+  end
+
+  def def_alias(dst, src)
+    Alias.new(self, dst, src)
+  end
+
+  def def_allocate(tmpl)
+    h = {:meth => "allocate", :singleton => true}
+    Allocate.new(self, tmpl, h)
+  end
+
+  def binary(meth, ope=nil)
+    ope = meth if ope.nil?
+    def_method(meth, 1, "binary", :op => ope)
+  end
+
+  def binary2(meth, ope=nil)
+    ope = meth if ope.nil?
+    def_method(meth, 1, "binary2", :op =>ope)
+  end
+
+  def unary(meth, ope=nil)
+    def_method(meth, 0, "unary", :op => ope)
+  end
+
+  def pow
+    def_method("pow", 1, "pow", :op => "**")
+  end
+
+  def unary2(meth, dtype, tpclass)
+    h = {:dtype => dtype, :tpclass => tpclass}
+    def_method(meth, 0, "unary2", h)
+  end
+
+  def set2(meth, dtype, tpclass)
+    h = {:dtype => dtype, :tpclass => tpclass}
+    def_method(meth, 1, "set2", h)
+  end
+
+  def cond_binary(meth,op=nil)
+    op = meth unless op
+    def_method(meth, 1, "cond_binary", :op => op)
+  end
+
+  def cond_unary(meth)
+    def_method(meth, 0, "cond_unary")
+  end
+
+  def bit_count(meth)
+    def_method(meth, -1, "bit_count")
+  end
+
+  def bit_reduce(meth, init_bit)
+    h = {:init_bit=>init_bit}
+    def_method(meth, -1, "bit_reduce", h)
+  end
+
+  def accum(meth, dtype, tpclass)
+    h = {:dtype => dtype, :tpclass => tpclass}
+    def_method(meth, -1, "accum", h)
+  end
+
+  def accum_index(meth)
+    def_method(meth, -1, "accum_index")
+  end
+
+  def cum(meth, cmacro)
+    def_method(meth, -1, "cum", cmacro:cmacro)
+  end
+
+  def accum_binary(meth, ope=nil)
+    ope = meth if ope.nil?
+    def_method(meth, -1, "accum_binary", :op => ope)
+  end
+
+  def qsort(tp, dtype, dcast, suffix="")
+    h = {:tp => tp, :dtype => dtype, :dcast => dcast, :suffix => suffix}
+    NodefFunction.new(self, "qsort", h)
+  end
+
+  def def_mod_func(meth, n_arg, tmpl=nil, opts={})
+    h = {:meth => meth, :n_arg => n_arg}
+    h.merge!(opts)
+    tmpl ||= meth
+    ModuleFunction.new(self, tmpl, h)
+  end
+
+  def math(meth, n=1, tmpl=nil)
+    h = {:mod_var => 'mTM'}
+    if tmpl.nil?
+      case n
+      when 1
+        tmpl = "unary_s"
+      when 2
+        tmpl = "binary_s"
+      when 3
+        tmpl = "ternary_s"
+      else
+        raise "invalid n=#{n}"
+      end
+    end
+    def_mod_func(meth, n, tmpl, h)
+  end
+
+  def store_numeric
+    StoreNum.new(self,"store_numeric")
+  end
+
+  def store_array
+    StoreArray.new(self,"store_array")
+  end
+
+  def cast_array
+    CastArray.new(self,"cast_array")
+  end
+
+  def store_from(cname,dtype,macro)
+    Store.new(self,"store_from",cname.downcase,dtype,"numo_c"+cname,macro)
+  end
+
+  def store_bit(cname)
+    Store.new(self,"store_bit",cname.downcase,nil,"numo_c"+cname,nil)
+  end
+
+  def store
+    Function.new(self,"store","store")
+  end
+
+  def find_method(meth)
+    Function::DEFS.find{|x| x.kind_of?(Function) and meth == x.meth }
+  end
+
+  def find_tmpl(meth)
+    Function::DEFS.find{|x| x.kind_of?(Function) and meth == x.tmpl }
+  end
+
+  def cast_func
+    "numo_#{tp}_s_cast"
+  end
+end
+
+# ----------------------------------------------------------------------
+
+class DataType < ErbPP
+  include DefMethod
+
+  def initialize(erb_path, type_file)
+    super(nil, erb_path)
+    @class_alias = []
+    @upcast = []
+    @mod_var = "cT"
+    load_type(type_file) if type_file
+    dirs = template_dir || ["tmpl"]
+    @tmpl_dirs = dirs.map{|d| File.join(File.dirname(erb_path),d)}
+  end
+
+  attr_reader :tmpl_dirs
+
+  def load_type(file)
+    eval File.read(file)
+  end
+
+  attrs = %w[
+    class_name
+    ctype
+    template_dir
+    blas_char
+    complex_class_name
+    complex_type
+    real_class_name
+    real_ctype
+    has_math
+    is_bit
+    is_int
+    is_unsigned
+    is_float
+    is_real
+    is_complex
+    is_object
+    is_comparable
+    is_double_precision
+    mod_var
+  ]
+  define_attrs attrs
+
+  def type_name
+    @type_name ||= class_name.downcase
+  end
+  alias tp type_name
+
+  def type_var
+    @type_var ||= "numo_c"+class_name
+  end
+
+  def math_var
+    @math_var ||= "numo_m"+class_name+"Math"
+  end
+
+  def real_class_name(arg=nil)
+    if arg.nil?
+      @real_class_name ||= class_name
+    else
+      @real_class_name = arg
+    end
+  end
+
+  def real_ctype(arg=nil)
+    if arg.nil?
+      @real_ctype ||= ctype
+    else
+      @real_ctype = arg
+    end
+  end
+
+  def real_type_var
+    @real_type_var ||= "numo_c"+real_class_name
+  end
+
+  def real_type_name
+    @real_type_name ||= real_class_name.downcase
+  end
+
+  def class_alias(*args)
+    @class_alias.concat(args)
+  end
+
+  def upcast(c=nil,t=nil)
+    if c
+      if t
+        t = "numo_c#{t}"
+      else
+        t = "cT"
+      end
+      @upcast << "rb_hash_aset(hCast, numo_c#{c}, #{t});"
+    else
+      @upcast
+    end
+  end
+
+  def upcast_rb(c,t=nil)
+    if t
+      t = "numo_c#{t}"
+    else
+      t = "cT"
+    end
+    if c=="Integer"
+      @upcast << "#ifdef RUBY_INTEGER_UNIFICATION"
+      @upcast << "rb_hash_aset(hCast, rb_cInteger, #{t});"
+      @upcast << "#else"
+      @upcast << "rb_hash_aset(hCast, rb_cFixnum, #{t});"
+      @upcast << "rb_hash_aset(hCast, rb_cBignum, #{t});"
+      @upcast << "#endif"
+    else
+      @upcast << "rb_hash_aset(hCast, rb_c#{c}, #{t});"
+    end
+  end
+end
+
+
+# ----------------------------------------------------------------------
+
+
+class Allocate < Function
+  def definition
+    "rb_define_alloc_func(#{mod_var}, #{c_func});"
+  end
+end
+
+# ----------------------------------------------------------------------
+
+class Store < Function
+  DEFS = []
+
+  def initialize(parent,tmpl,tpname,dtype,tpclass,macro)
+    super(parent,tmpl)
+    @tpname=tpname
+    @dtype=dtype
+    @tpclass=tpclass
+    @macro=macro
+    DEFS.push(self)
+  end
+  attr_reader :tmpl, :tpname, :dtype, :tpclass, :macro
+
+  def c_func
+    "numo_#{tp}_store_#{tpname}"
+  end
+
+  def c_iter
+    "iter_#{tp}_store_#{tpname}"
+  end
+
+  def definition
+    nil
+  end
+
+  def condition(klass)
+    "#{klass}==#{tpclass}"
+  end
+
+  def extract_data(ptr,pos,x)
+    case tpname
+    when "bit"
+      "{BIT_DIGIT b; LOAD_BIT(#{ptr},#{pos},b); x = m_from_real(b);}"
+    when "robject"
+      "#{x} = m_num_to_data(*(#{dtype}*)(#{ptr}+#{pos}))"
+    when /complex/
+      "{#{dtype} *p = (#{dtype}*)(#{ptr}+#{pos}); #{x} = c_new(REAL(*p),IMAG(*p));}"
+    else
+      "#{x} = m_from_real(*(#{dtype}*)(#{ptr}+#{pos}))"
+    end
+  end
+
+  def self.definitions
+    a = []
+    DEFS.each do |x|
+      if x.condition("")
+        if x.tpname == x.parents[0].class_name.downcase
+          a.unshift(x)
+        else
+          a.push(x)
+        end
+      end
+    end
+    a
+  end
+end
+
+class StoreNum < Store
+  def initialize(parent,tmpl)
+    super(parent,tmpl,"numeric",nil,nil,nil)
+  end
+
+  def condition(klass)
+    "IS_INTEGER_CLASS(#{klass}) || #{klass}==rb_cFloat || #{klass}==rb_cComplex"
+  end
+end
+
+class StoreArray < Store
+  def initialize(parent,tmpl)
+    super(parent,tmpl,"array",nil,nil,nil)
+  end
+
+  def c_func
+    "numo_#{tp}_#{tmpl}"
+  end
+
+  def condition(klass)
+    "#{klass}==rb_cArray"
+  end
+end
+
+class CastArray < StoreArray
+  def condition(klass)
+    nil
+  end
+
+  def c_func
+    "numo_#{tp}_cast_#{tpname}"
+  end
+
+  def c_iter
+    "iter_#{tp}_cast_#{tpname}"
+  end
+end
diff --git a/lib/numo/narray.rb b/lib/numo/narray.rb
new file mode 100644
index 0000000..e0e3143
--- /dev/null
+++ b/lib/numo/narray.rb
@@ -0,0 +1,8 @@
+begin
+  major, minor, _ = RUBY_VERSION.split(/\./)
+  require "#{major}.#{minor}/numo/narray.so"
+rescue LoadError
+  require "numo/narray.so"
+end
+
+require "numo/narray/extra"
diff --git a/lib/numo/narray/extra.rb b/lib/numo/narray/extra.rb
new file mode 100644
index 0000000..5e3c99e
--- /dev/null
+++ b/lib/numo/narray/extra.rb
@@ -0,0 +1,1262 @@
+module Numo
+  class NArray
+
+    # Return an unallocated array with the same shape and type as self.
+    def new_narray
+      self.class.new(*shape)  # built through self.class, so the result has the receiver's own class
+    end
+
+    # Return an array of zeros with the same shape and type as self.
+    def new_zeros
+      self.class.zeros(*shape)  # delegates to the receiver class's zeros with self's shape
+    end
+
+    # Return an array of ones with the same shape and type as self.
+    def new_ones
+      self.class.ones(*shape)  # delegates to the receiver class's ones with self's shape
+    end
+
+    # Return an array filled with value with the same shape and type as self.
+    def new_fill(value)
+      self.class.new(*shape).fill(value)  # new array of self's class and shape, then fill; value is passed to fill unchanged
+    end
+
+    # Convert angles from radians to degrees.
+    def rad2deg
+      self * (180/Math::PI)  # Math::PI is a Float, so 180/Math::PI is float division
+    end
+
+    # Convert angles from degrees to radians.
+    def deg2rad
+      self * (Math::PI/180)  # multiplies self by the scalar factor pi/180
+    end
+
+    # Flip each row in the left/right direction.
+    # Same as `a[true, (-1..0).step(-1), ...]`.
+    def fliplr
+      reverse(1)  # reverse along axis 1; NOTE(review): presumably requires ndim >= 2 — confirm
+    end
+
+    # Flip each column in the up/down direction.
+    # Same as `a[(-1..0).step(-1), ...]`.
+    def flipud
+      reverse(0)  # reverse along axis 0 (the first dimension)
+    end
+
+    # Multi-dimensional array indexing.
+    # Same as [] for one-dimensional NArray.
+    # Similar to numpy's tuple indexing, i.e., `a[[1,2,..],[3,4,..]]`
+    # (This method will be rewritten in C)
+    # @return [Numo::NArray] one-dimensional view of self.
+    # @example
+    #   p x = Numo::DFloat.new(3,3,3).seq
+    #   # Numo::DFloat#shape=[3,3,3]
+    #   # [[[0, 1, 2],
+    #   #   [3, 4, 5],
+    #   #   [6, 7, 8]],
+    #   #  [[9, 10, 11],
+    #   #   [12, 13, 14],
+    #   #   [15, 16, 17]],
+    #   #  [[18, 19, 20],
+    #   #   [21, 22, 23],
+    #   #   [24, 25, 26]]]
+    #
+    #   p x.at([0,1,2],[0,1,2],[-1,-2,-3])
+    #   # Numo::DFloat(view)#shape=[3]
+    #   # [2, 13, 24]
+    def at(*indices)
+      if indices.size != ndim
+        raise DimensionError, "argument length does not match dimension size"
+      end
+      idx = nil
+      stride = 1
+      (indices.size-1).downto(0) do |i|
+        ix = Int64.cast(indices[i])
+        if ix.ndim != 1
+          raise DimensionError, "index array is not one-dimensional"
+        end
+        ix[ix < 0] += shape[i]
+        if ((ix < 0) & (ix >= shape[i])).any?
+          raise IndexError, "index array is out of range"
+        end
+        if idx
+          if idx.size != ix.size
+            raise ShapeError, "index array sizes mismatch"
+          end
+          idx += ix * stride
+          stride *= shape[i]
+        else
+          idx = ix
+          stride = shape[i]
+        end
+      end
+      self[idx]
+    end
+
+    # Rotate in the plane specified by axes.
+    # @example
+    #   p a = Numo::Int32.new(2,2).seq
+    #   # Numo::Int32#shape=[2,2]
+    #   # [[0, 1],
+    #   #  [2, 3]]
+    #
+    #   p a.rot90
+    #   # Numo::Int32(view)#shape=[2,2]
+    #   # [[1, 3],
+    #   #  [0, 2]]
+    #
+    #   p a.rot90(2)
+    #   # Numo::Int32(view)#shape=[2,2]
+    #   # [[3, 2],
+    #   #  [1, 0]]
+    #
+    #   p a.rot90(3)
+    #   # Numo::Int32(view)#shape=[2,2]
+    #   # [[2, 0],
+    #   #  [3, 1]]
+    def rot90(k=1,axes=[0,1])
+      case k % 4
+      when 0
+        view
+      when 1
+        swapaxes(*axes).reverse(axes[0])
+      when 2
+        reverse(*axes)
+      when 3
+        swapaxes(*axes).reverse(axes[1])
+      end
+    end
+
+    def to_i
+      if size==1
+        self[0].to_i
+      else
+        # convert to Int?
+        raise TypeError, "can't convert #{self.class} into Integer"
+      end
+    end
+
+    def to_f
+      if size==1
+        self[0].to_f
+      else
+        # convert to DFloat?
+        raise TypeError, "can't convert #{self.class} into Float"
+      end
+    end
+
+    def to_c
+      if size==1
+        Complex(self[0])
+      else
+        # convert to DComplex?
+        raise TypeError, "can't convert #{self.class} into Complex"
+      end
+    end
+
+    # Convert the argument to an narray if not an narray.
+    def self.cast(a)
+      a.kind_of?(NArray) ? a : NArray.array_type(a).cast(a)  # NArray instances are returned as-is; otherwise cast with the class NArray.array_type picks
+    end
+
+    def self.asarray(a)
+      case a
+      when NArray
+        (a.ndim == 0) ? a[:new] : a
+      when Numeric,Range
+        self[a]
+      else
+        cast(a)
+      end
+    end
+
+    # parse matrix like matlab, octave
+    # @example
+    #   a = Numo::DFloat.parse %[
+    #    2 -3 5
+    #    4 9 7
+    #    2 -1 6
+    #   ]
+    #   => Numo::DFloat#shape=[3,3]
+    #   [[2, -3, 5],
+    #    [4, 9, 7],
+    #    [2, -1, 6]]
+
+    def self.parse(str, split1d:/\s+/, split2d:/;?$|;/,
+                   split3d:/\s*\n(\s*\n)+/m)
+      a = []   # non-empty blocks; each block is an array of rows
+      str.split(split3d).each do |block|   # split3d has a capture group, so split also yields the captured separators; they strip to empty below
+        b = []
+        # b collects the non-empty rows of this block
+        block.split(split2d).each do |line|
+          # NOTE(security): every item is passed to eval below — only parse trusted text
+          line.strip!
+          if !line.empty?
+            c = []
+            line.split(split1d).each do |item|
+              c << eval(item.strip) if !item.empty?   # evaluated in this method's binding (locals a, b, c, ... are visible)
+            end
+            b << c if !c.empty?
+          end
+        end
+        a << b if !b.empty?
+      end
+      if a.size==1
+        self.cast(a[0])   # a single block is cast without the extra outer nesting level
+      else
+        self.cast(a)
+      end
+    end
+
+    # Append values to the end of an narray.
+    # @example
+    #   a = Numo::DFloat[1, 2, 3]
+    #   p a.append([[4, 5, 6], [7, 8, 9]])
+    #   # Numo::DFloat#shape=[9]
+    #   # [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    #
+    #   a = Numo::DFloat[[1, 2, 3]]
+    #   p a.append([[4, 5, 6], [7, 8, 9]],axis:0)
+    #   # Numo::DFloat#shape=[3,3]
+    #   # [[1, 2, 3],
+    #   #  [4, 5, 6],
+    #   #  [7, 8, 9]]
+    #
+    #   a = Numo::DFloat[[1, 2, 3], [4, 5, 6]]
+    #   p a.append([7, 8, 9], axis:0)
+    #   # in `append': dimension mismatch (Numo::NArray::DimensionError)
+
+    def append(other,axis:nil)
+      other = self.class.cast(other)
+      if axis
+        if ndim != other.ndim
+          raise DimensionError, "dimension mismatch"
+        end
+        return concatenate(other,axis:axis)
+      else
+        a = self.class.zeros(size+other.size)
+        a[0...size] = self[true]
+        a[size..-1] = other[true]
+        return a
+      end
+    end
+
+    # Return a new array with sub-arrays along an axis deleted.
+    # If axis is not given, obj is applied to the flattened array.
+
+    # @example
+    #   a = Numo::DFloat[[1,2,3,4], [5,6,7,8], [9,10,11,12]]
+    #   p a.delete(1,0)
+    #   # Numo::DFloat(view)#shape=[2,4]
+    #   # [[1, 2, 3, 4],
+    #   #  [9, 10, 11, 12]]
+    #
+    #   p a.delete((0..-1).step(2),1)
+    #   # Numo::DFloat(view)#shape=[3,2]
+    #   # [[2, 4],
+    #   #  [6, 8],
+    #   #  [10, 12]]
+    #
+    #   p a.delete([1,3,5])
+    #   # Numo::DFloat(view)#shape=[9]
+    #   # [1, 3, 5, 7, 8, 9, 10, 11, 12]
+
+    def delete(indice,axis=nil)
+      if axis
+        bit = Bit.ones(shape[axis])
+        bit[indice] = 0
+        idx = [true]*ndim
+        idx[axis] = bit.where
+        return self[*idx].copy
+      else
+        bit = Bit.ones(size)
+        bit[indice] = 0
+        return self[bit.where].copy
+      end
+    end
+
+    # Insert values along the axis before the indices.
+    # @example
+    #   p a = Numo::DFloat[[1, 2], [3, 4]]
+    #   a = Numo::Int32[[1, 1], [2, 2], [3, 3]]
+    #
+    #   p a.insert(1,5)
+    #   # Numo::Int32#shape=[7]
+    #   # [1, 5, 1, 2, 2, 3, 3]
+    #
+    #   p a.insert(1, 5, axis:1)
+    #   # Numo::Int32#shape=[3,3]
+    #   # [[1, 5, 1],
+    #   #  [2, 5, 2],
+    #   #  [3, 5, 3]]
+    #
+    #   p a.insert([1], [[11],[12],[13]], axis:1)
+    #   # Numo::Int32#shape=[3,3]
+    #   # [[1, 11, 1],
+    #   #  [2, 12, 2],
+    #   #  [3, 13, 3]]
+    #
+    #   p a.insert(1, [11, 12, 13], axis:1)
+    #   # Numo::Int32#shape=[3,3]
+    #   # [[1, 11, 1],
+    #   #  [2, 12, 2],
+    #   #  [3, 13, 3]]
+    #
+    #   p a.insert([1], [11, 12, 13], axis:1)
+    #   # Numo::Int32#shape=[3,5]
+    #   # [[1, 11, 12, 13, 1],
+    #   #  [2, 11, 12, 13, 2],
+    #   #  [3, 11, 12, 13, 3]]
+    #
+    #   p b = a.flatten
+    #   # Numo::Int32(view)#shape=[6]
+    #   # [1, 1, 2, 2, 3, 3]
+    #
+    #   p b.insert(2,[15,16])
+    #   # Numo::Int32#shape=[8]
+    #   # [1, 1, 15, 16, 2, 2, 3, 3]
+    #
+    #   p b.insert([2,2],[15,16])
+    #   # Numo::Int32#shape=[8]
+    #   # [1, 1, 15, 16, 2, 2, 3, 3]
+    #
+    #   p b.insert([2,1],[15,16])
+    #   # Numo::Int32#shape=[8]
+    #   # [1, 16, 1, 15, 2, 2, 3, 3]
+    #
+    #   p b.insert([2,0,1],[15,16,17])
+    #   # Numo::Int32#shape=[9]
+    #   # [16, 1, 17, 1, 15, 2, 2, 3, 3]
+    #
+    #   p b.insert(2..3, [15, 16])
+    #   # Numo::Int32#shape=[8]
+    #   # [1, 1, 15, 2, 16, 2, 3, 3]
+    #
+    #   p b.insert(2, [7.13, 0.5])
+    #   # Numo::Int32#shape=[8]
+    #   # [1, 1, 7, 0, 2, 2, 3, 3]
+    #
+    #   p x = Numo::DFloat.new(2,4).seq
+    #   # Numo::DFloat#shape=[2,4]
+    #   # [[0, 1, 2, 3],
+    #   #  [4, 5, 6, 7]]
+    #
+    #   p x.insert([1,3],999,axis:1)
+    #   # Numo::DFloat#shape=[2,6]
+    #   # [[0, 999, 1, 2, 999, 3],
+    #   #  [4, 999, 5, 6, 999, 7]]
+
+    def insert(indice,values,axis:nil)
+      if axis
+        values = self.class.asarray(values)
+        nd = values.ndim
+        midx = [:new]*(ndim-nd) + [true]*nd   # index that pads values with leading new axes up to self's rank
+        case indice
+        when Numeric   # scalar position — NOTE(review): looks like this moves the new axis onto `axis` so values span the other dimensions; confirm
+          midx[-nd-1] = true
+          midx[axis] = :new
+        end
+        values = values[*midx]
+      else
+        values = self.class.asarray(values).flatten   # no axis: insertion works on flattened data
+      end
+      idx = Int64.asarray(indice)
+      nidx = idx.size
+      if nidx == 1
+        nidx = values.shape[axis||0]       # one position given: the count comes from values instead
+        idx = idx + Int64.new(nidx).seq    # values occupy consecutive slots starting at that position
+      else
+        sidx = idx.sort_index
+        idx[sidx] += Int64.new(nidx).seq   # the k-th smallest position is shifted by k
+      end
+      if axis
+        bit = Bit.ones(shape[axis]+nidx)
+        bit[idx] = 0                       # cleared bits mark the slots reserved for inserted values
+        new_shape = shape                  # NOTE(review): mutated on the next line; assumes shape returns a fresh Array — confirm
+        new_shape[axis] += nidx
+        a = self.class.zeros(new_shape)
+        mdidx = [true]*ndim
+        mdidx[axis] = bit.where
+        a[*mdidx] = self                   # original data goes to the slots still set in bit
+        mdidx[axis] = idx
+        a[*mdidx] = values                 # inserted data goes to the reserved slots
+      else
+        bit = Bit.ones(size+nidx)
+        bit[idx] = 0
+        a = self.class.zeros(size+nidx)
+        a[bit.where] = self.flatten
+        a[idx] = values
+      end
+      return a
+    end
+
+    class << self
+    # @example
+    #   p a = Numo::DFloat[[1, 2], [3, 4]]
+    #   # Numo::DFloat#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p b = Numo::DFloat[[5, 6]]
+    #   # Numo::DFloat#shape=[1,2]
+    #   # [[5, 6]]
+    #
+    #   p Numo::NArray.concatenate([a,b],axis:0)
+    #   # Numo::DFloat#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [5, 6]]
+    #
+    #   p Numo::NArray.concatenate([a,b.transpose], axis:1)
+    #   # Numo::DFloat#shape=[2,3]
+    #   # [[1, 2, 5],
+    #   #  [3, 4, 6]]
+
+    def concatenate(arrays,axis:0)
+      klass = (self==NArray) ? NArray.array_type(arrays) : self   # called on NArray itself: choose the class from the data
+      nd = 0   # becomes the largest ndim among the inputs
+      arrays = arrays.map do |a|
+        case a
+        when NArray
+          # ok
+        when Numeric
+          a = klass[a]
+        when Array
+          a = klass.cast(a)
+        else
+          raise TypeError,"not Numo::NArray: #{a.inspect[0..48]}"
+        end
+        if a.ndim > nd
+          nd = a.ndim
+        end
+        a
+      end
+      if axis < 0
+        axis += nd   # a negative axis counts from the last dimension
+      end
+      if axis < 0 || axis >= nd
+        raise ArgumentError,"axis is out of range"
+      end
+      new_shape = nil   # shape without the concatenation axis, taken from the first input
+      sum_size = 0      # total length along the concatenation axis
+      arrays.each do |a|
+        a_shape = a.shape
+        if nd != a_shape.size
+          a_shape = [1]*(nd-a_shape.size) + a_shape   # lower-rank inputs are treated as having leading size-1 dimensions
+        end
+        sum_size += a_shape.delete_at(axis)   # delete_at also removes the axis entry, so only the other dimensions are compared
+        if new_shape
+          if new_shape != a_shape
+            raise ShapeError,"shape mismatch"
+          end
+        else
+          new_shape = a_shape
+        end
+      end
+      new_shape.insert(axis,sum_size)
+      result = klass.zeros(*new_shape)
+      lst = 0
+      refs = [true] * nd
+      arrays.each do |a|
+        fst = lst
+        lst = fst + (a.shape[axis-nd]||1)   # axis-nd is negative, so it indexes from the end; nil (axis missing in a) counts as length 1
+        refs[axis] = fst...lst
+        result[*refs] = a
+      end
+      result
+    end
+
+    # Stack arrays vertically (row wise).
+    # @example
+    #   a = Numo::Int32[1,2,3]
+    #   b = Numo::Int32[2,3,4]
+    #   p Numo::NArray.vstack([a,b])
+    #   # Numo::Int32#shape=[2,3]
+    #   # [[1, 2, 3],
+    #   #  [2, 3, 4]]
+    #
+    #   a = Numo::Int32[[1],[2],[3]]
+    #   b = Numo::Int32[[2],[3],[4]]
+    #   p Numo::NArray.vstack([a,b])
+    #   # Numo::Int32#shape=[6,1]
+    #   # [[1],
+    #   #  [2],
+    #   #  [3],
+    #   #  [2],
+    #   #  [3],
+    #   #  [4]]
+
+    def vstack(arrays)
+      arys = arrays.map do |a|
+        _atleast_2d(cast(a))
+      end
+      concatenate(arys,axis:0)
+    end
+
+    # Stack arrays horizontally (column wise).
+    # @example
+    #   a = Numo::Int32[1,2,3]
+    #   b = Numo::Int32[2,3,4]
+    #   p Numo::NArray.hstack([a,b])
+    #   # Numo::Int32#shape=[6]
+    #   # [1, 2, 3, 2, 3, 4]
+    #
+    #   a = Numo::Int32[[1],[2],[3]]
+    #   b = Numo::Int32[[2],[3],[4]]
+    #   p Numo::NArray.hstack([a,b])
+    #   # Numo::Int32#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [2, 3],
+    #   #  [3, 4]]
+
+    def hstack(arrays)
+      klass = (self==NArray) ? NArray.array_type(arrays) : self
+      nd = 0
+      arys = arrays.map do |a|
+        a = klass.cast(a)
+        nd = a.ndim if a.ndim > nd
+        a
+      end
+      dim = (nd >= 2) ? 1 : 0
+      concatenate(arys,axis:dim)
+    end
+
+    # Stack arrays in depth wise (along third axis).
+    # @example
+    #   a = Numo::Int32[1,2,3]
+    #   b = Numo::Int32[2,3,4]
+    #   p Numo::NArray.dstack([a,b])
+    #   # Numo::Int32#shape=[1,3,2]
+    #   # [[[1, 2],
+    #   #   [2, 3],
+    #   #   [3, 4]]]
+    #
+    #   a = Numo::Int32[[1],[2],[3]]
+    #   b = Numo::Int32[[2],[3],[4]]
+    #   p Numo::NArray.dstack([a,b])
+    #   # Numo::Int32#shape=[3,1,2]
+    #   # [[[1, 2]],
+    #   #  [[2, 3]],
+    #   #  [[3, 4]]]
+
+    def dstack(arrays)
+      arys = arrays.map do |a|
+        _atleast_3d(cast(a))
+      end
+      concatenate(arys,axis:2)
+    end
+
+    # Stack 1-d arrays into columns of a 2-d array.
+    # @example
+    #   x = Numo::Int32[1,2,3]
+    #   y = Numo::Int32[2,3,4]
+    #   p Numo::NArray.column_stack([x,y])
+    #   # Numo::Int32#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [2, 3],
+    #   #  [3, 4]]
+
+    def column_stack(arrays)
+      arys = arrays.map do |a|
+        a = cast(a)
+        case a.ndim
+        when 0; a[:new,:new]
+        when 1; a[true,:new]
+        else; a
+        end
+      end
+      concatenate(arys,axis:1)
+    end
+
+    private
+    # Return an narray with at least two dimension.
+    def _atleast_2d(a)
+      case a.ndim
+      when 0; a[:new,:new]
+      when 1; a[:new,true]
+      else;   a
+      end
+    end
+
+    # Return an narray with at least three dimension.
+    def _atleast_3d(a)
+      case a.ndim
+      when 0; a[:new,:new,:new]
+      when 1; a[:new,true,:new]
+      when 2; a[true,true,:new]
+      else;   a
+      end
+    end
+
+    end # class << self
+
+    # @example
+    #   p a = Numo::DFloat[[1, 2], [3, 4]]
+    #   # Numo::DFloat#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p b = Numo::DFloat[[5, 6]]
+    #   # Numo::DFloat#shape=[1,2]
+    #   # [[5, 6]]
+    #
+    #   p a.concatenate(b,axis:0)
+    #   # Numo::DFloat#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [5, 6]]
+    #
+    #   p a.concatenate(b.transpose, axis:1)
+    #   # Numo::DFloat#shape=[2,3]
+    #   # [[1, 2, 5],
+    #   #  [3, 4, 6]]
+
+    def concatenate(*arrays,axis:0)
+      axis = check_axis(axis)
+      self_shape = shape
+      self_shape.delete_at(axis)   # self's shape without the concatenation axis; every argument must match it
+      sum_size = shape[axis]       # running total length along axis, starting with self's own
+      arrays.map! do |a|
+        case a
+        when NArray
+          # ok
+        when Numeric
+          a = self.class.new(1).store(a)   # a number becomes a one-element array of self's class
+        when Array
+          a = self.class.cast(a)
+        else
+          raise TypeError,"not Numo::NArray: #{a.inspect[0..48]}"
+        end
+        if a.ndim > ndim
+          raise ShapeError,"dimension mismatch"
+        end
+        a_shape = a.shape
+        sum_size += a_shape.delete_at(axis-ndim) || 1   # axis-ndim is negative, so it indexes from the end; nil (axis missing in a) counts as 1
+        if self_shape != a_shape
+          raise ShapeError,"shape mismatch"
+        end
+        a
+      end
+      self_shape.insert(axis,sum_size)
+      result = self.class.zeros(*self_shape)
+      lst = shape[axis]
+      refs = [true] * ndim
+      refs[axis] = 0...lst
+      result[*refs] = self         # self occupies the leading slice along axis
+      arrays.each do |a|
+        fst = lst
+        lst = fst + (a.shape[axis-ndim] || 1)
+        refs[axis] = fst...lst     # each argument fills the next slice along axis
+        result[*refs] = a
+      end
+      result
+    end
+
+    # @example
+    #   p x = Numo::DFloat.new(9).seq
+    #   # Numo::DFloat#shape=[9]
+    #   # [0, 1, 2, 3, 4, 5, 6, 7, 8]
+    #
+    #   pp x.split(3)
+    #   # [Numo::DFloat(view)#shape=[3]
+    #   # [0, 1, 2],
+    #   #  Numo::DFloat(view)#shape=[3]
+    #   # [3, 4, 5],
+    #   #  Numo::DFloat(view)#shape=[3]
+    #   # [6, 7, 8]]
+    #
+    #   p x = Numo::DFloat.new(8).seq
+    #   # Numo::DFloat#shape=[8]
+    #   # [0, 1, 2, 3, 4, 5, 6, 7]
+    #
+    #   pp x.split([3, 5, 6, 10])
+    #   # [Numo::DFloat(view)#shape=[3]
+    #   # [0, 1, 2],
+    #   #  Numo::DFloat(view)#shape=[2]
+    #   # [3, 4],
+    #   #  Numo::DFloat(view)#shape=[1]
+    #   # [5],
+    #   #  Numo::DFloat(view)#shape=[2]
+    #   # [6, 7],
+    #   #  Numo::DFloat(view)#shape=[0][]]
+
+    def split(indices_or_sections, axis:0)
+      axis = check_axis(axis)
+      size_axis = shape[axis]
+      case indices_or_sections
+      when Integer   # an Integer is a number of sections
+        div_axis, mod_axis = size_axis.divmod(indices_or_sections)
+        refs = [true]*ndim
+        beg_idx = 0
+        mod_axis.times.map do |i|   # the first mod_axis sections get one extra element
+          end_idx = beg_idx + div_axis + 1
+          refs[axis] = beg_idx ... end_idx
+          beg_idx = end_idx
+          self[*refs]
+        end +
+        (indices_or_sections-mod_axis).times.map do |i|   # the remaining sections have div_axis elements
+          end_idx = beg_idx + div_axis
+          refs[axis] = beg_idx ... end_idx
+          beg_idx = end_idx
+          self[*refs]
+        end
+      when NArray
+        split(indices_or_sections.to_a,axis:axis)   # handled as the Array case
+      when Array   # an Array lists the end (exclusive) of each section
+        refs = [true]*ndim
+        fst = 0
+        (indices_or_sections + [size_axis]).map do |lst|   # size_axis is appended so the tail forms the last section
+          lst = size_axis if lst > size_axis   # boundaries past the end are clamped
+          refs[axis] = (fst < size_axis) ? fst...lst : -1...-1   # -1...-1 is an empty range, used once fst has reached the end
+          fst = lst
+          self[*refs]
+        end
+      else
+        raise TypeError,"argument must be Integer or Array"
+      end
+    end
+
+    # @example
+    #   p x = Numo::DFloat.new(4,4).seq
+    #   # Numo::DFloat#shape=[4,4]
+    #   # [[0, 1, 2, 3],
+    #   #  [4, 5, 6, 7],
+    #   #  [8, 9, 10, 11],
+    #   #  [12, 13, 14, 15]]
+    #
+    #   pp x.hsplit(2)
+    #   # [Numo::DFloat(view)#shape=[4,2]
+    #   # [[0, 1],
+    #   #  [4, 5],
+    #   #  [8, 9],
+    #   #  [12, 13]],
+    #   #  Numo::DFloat(view)#shape=[4,2]
+    #   # [[2, 3],
+    #   #  [6, 7],
+    #   #  [10, 11],
+    #   #  [14, 15]]]
+    #
+    #   pp x.hsplit([3, 6])
+    #   # [Numo::DFloat(view)#shape=[4,3]
+    #   # [[0, 1, 2],
+    #   #  [4, 5, 6],
+    #   #  [8, 9, 10],
+    #   #  [12, 13, 14]],
+    #   #  Numo::DFloat(view)#shape=[4,1]
+    #   # [[3],
+    #   #  [7],
+    #   #  [11],
+    #   #  [15]],
+    #   #  Numo::DFloat(view)#shape=[4,0][]]
+
+    def vsplit(indices_or_sections)
+      split(indices_or_sections, axis:0)  # split along axis 0; the argument is passed to split unchanged
+    end
+
+    def hsplit(indices_or_sections)
+      split(indices_or_sections, axis:1)  # split along axis 1; the argument is passed to split unchanged
+    end
+
+    def dsplit(indices_or_sections)
+      split(indices_or_sections, axis:2)  # split along axis 2; the argument is passed to split unchanged
+    end
+
+    # @example
+    #   p a = Numo::NArray[0,1,2]
+    #   # Numo::Int32#shape=[3]
+    #   # [0, 1, 2]
+    #
+    #   p a.tile(2)
+    #   # Numo::Int32#shape=[6]
+    #   # [0, 1, 2, 0, 1, 2]
+    #
+    #   p a.tile(2,2)
+    #   # Numo::Int32#shape=[2,6]
+    #   # [[0, 1, 2, 0, 1, 2],
+    #   #  [0, 1, 2, 0, 1, 2]]
+    #
+    #   p a.tile(2,1,2)
+    #   # Numo::Int32#shape=[2,1,6]
+    #   # [[[0, 1, 2, 0, 1, 2]],
+    #   #  [[0, 1, 2, 0, 1, 2]]]
+    #
+    #   p b = Numo::NArray[[1, 2], [3, 4]]
+    #   # Numo::Int32#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p b.tile(2)
+    #   # Numo::Int32#shape=[2,4]
+    #   # [[1, 2, 1, 2],
+    #   #  [3, 4, 3, 4]]
+    #
+    #   p b.tile(2,1)
+    #   # Numo::Int32#shape=[4,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [1, 2],
+    #   #  [3, 4]]
+    #
+    #   p c = Numo::NArray[1,2,3,4]
+    #   # Numo::Int32#shape=[4]
+    #   # [1, 2, 3, 4]
+    #
+    #   p c.tile(4,1)
+    #   # Numo::Int32#shape=[4,4]
+    #   # [[1, 2, 3, 4],
+    #   #  [1, 2, 3, 4],
+    #   #  [1, 2, 3, 4],
+    #   #  [1, 2, 3, 4]]
+
+    def tile(*arg)
+      arg.each do |i|
+        if !i.kind_of?(Integer) || i<1
+          raise ArgumentError,"argument should be positive integer"
+        end
+      end
+      ns = arg.size
+      nd = self.ndim
+      shp = self.shape
+      new_shp = []   # work shape: a (repeat count, dimension size) pair per result dimension
+      src_shp = []   # index applied to self so that it lines up with new_shp
+      res_shp = []   # final shape: one entry (count * size) per pair
+      (nd-ns).times do   # leading dimensions without a repeat count: count 1
+        new_shp << 1
+        new_shp << (n = shp.shift)
+        src_shp << :new
+        src_shp << true
+        res_shp << n
+      end
+      (ns-nd).times do   # more counts than dimensions: extra leading dimensions of size 1
+        new_shp << (m = arg.shift)
+        new_shp << 1
+        src_shp << :new
+        src_shp << :new
+        res_shp << m
+      end
+      [nd,ns].min.times do   # dimensions that have both a size and a repeat count
+        new_shp << (m = arg.shift)
+        new_shp << (n = shp.shift)
+        src_shp << :new
+        src_shp << true
+        res_shp << n*m
+      end
+      self.class.new(*new_shp).store(self[*src_shp]).reshape(*res_shp)   # store self into the work shape, then merge each pair by reshape
+    end
+
+    # @example
+    #   p Numo::NArray[3].repeat(4)
+    #   # Numo::Int32#shape=[4]
+    #   # [3, 3, 3, 3]
+    #
+    #   p x = Numo::NArray[[1,2],[3,4]]
+    #   # Numo::Int32#shape=[2,2]
+    #   # [[1, 2],
+    #   #  [3, 4]]
+    #
+    #   p x.repeat(2)
+    #   # Numo::Int32#shape=[8]
+    #   # [1, 1, 2, 2, 3, 3, 4, 4]
+    #
+    #   p x.repeat(3,axis:1)
+    #   # Numo::Int32#shape=[2,6]
+    #   # [[1, 1, 1, 2, 2, 2],
+    #   #  [3, 3, 3, 4, 4, 4]]
+    #
+    #   p x.repeat([1,2],axis:0)
+    #   # Numo::Int32#shape=[3,2]
+    #   # [[1, 2],
+    #   #  [3, 4],
+    #   #  [3, 4]]
+
+    def repeat(arg,axis:nil)
+      case axis
+      when Integer
+        axis = check_axis(axis)
+        c = self
+      when NilClass
+        c = self.flatten
+        axis = 0
+      else
+        raise ArgumentError,"invalid axis"
+      end
+      case arg
+      when Integer
+        if !arg.kind_of?(Integer) || arg<1
+          raise ArgumentError,"argument should be positive integer"
+        end
+        idx = c.shape[axis].times.map{|i| [i]*arg}.flatten
+      else
+        arg = arg.to_a
+        if arg.size != c.shape[axis]
+          raise ArgumentError,"repeat size shoud be equal to size along axis"
+        end
+        arg.each do |i|
+          if !i.kind_of?(Integer) || i<0
+            raise ArgumentError,"argument should be non-negative integer"
+          end
+        end
+        idx = arg.each_with_index.map{|a,i| [i]*a}.flatten
+      end
+      ref = [true] * c.ndim
+      ref[axis] = idx
+      c[*ref].copy
+    end
+
+    # Calculate the n-th discrete difference along given axis.
+    # @example
+    #   p x = Numo::DFloat[1, 2, 4, 7, 0]
+    #   # Numo::DFloat#shape=[5]
+    #   # [1, 2, 4, 7, 0]
+    #
+    #   p x.diff
+    #   # Numo::DFloat#shape=[4]
+    #   # [1, 2, 3, -7]
+    #
+    #   p x.diff(2)
+    #   # Numo::DFloat#shape=[3]
+    #   # [1, 1, -10]
+    #
+    #   p x = Numo::DFloat[[1, 3, 6, 10], [0, 5, 6, 8]]
+    #   # Numo::DFloat#shape=[2,4]
+    #   # [[1, 3, 6, 10],
+    #   #  [0, 5, 6, 8]]
+    #
+    #   p x.diff
+    #   # Numo::DFloat#shape=[2,3]
+    #   # [[2, 3, 4],
+    #   #  [5, 1, 2]]
+    #
+    #   p x.diff(axis:0)
+    #   # Numo::DFloat#shape=[1,4]
+    #   # [[-1, 2, 0, -2]]
+
+    def diff(n=1,axis:-1)
+      axis = check_axis(axis)  # normalise a negative axis into 0...ndim
+      if n < 0 || n >= shape[axis]
+        raise ShapeError,"n=#{n} is invalid for shape[#{axis}]=#{shape[axis]}"
+      end
+      # build the n-th order difference coefficients, starting from [-1,1]
+      c = self.class[-1,1]
+      2.upto(n) do |i|
+        x = self.class.zeros(i+1)
+        x[0..-2] = c
+        y = self.class.zeros(i+1)
+        y[1..-1] = c
+        c = y - x  # shifted copy minus unshifted copy raises the order by one
+      end
+      s = [true]*ndim
+      s[axis] = n..-1
+      result = self[*s].dup
+      sum = result.inplace  # `sum` is used below to accumulate into `result`
+      (n-1).downto(0) do |i|
+        s = [true]*ndim
+        s[axis] = i..-n-1+i
+        sum + self[*s] * c[i] # inplace addition
+      end
+      return result
+    end
+
+
+    # Upper triangular matrix.
+    # Return a copy with the elements below the k-th diagonal filled with zero.
+    def triu(k=0)
+      dup.triu!(k)
+    end
+
+    # Upper triangular matrix.
+    # Fill the self elements below the k-th diagonal with zero.
+    def triu!(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      if contiguous?
+        *shp,m,n = shape
+        idx = tril_indices(k-1)  # positions strictly below the k-th diagonal
+        reshape!(*shp,m*n)       # merge the last two axes so idx can address them
+        self[false,idx] = 0
+        reshape!(*shp,m,n)       # restore the original shape
+      else
+        store(triu(k))           # non-contiguous: compute on a copy, then write back
+      end
+    end
+
+    # Return the indices for the uppler-triangle on and above the k-th diagonal.
+    def triu_indices(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      m,n = shape[-2..-1]
+      NArray.triu_indices(m,n,k=0)
+    end
+
+    # Return the indices for the uppler-triangle on and above the k-th diagonal.
+    def self.triu_indices(m,n,k=0)
+      x = Numo::Int64.new(m,1).seq + k
+      y = Numo::Int64.new(1,n).seq
+      (x<=y).where
+    end
+
+    # Lower triangular matrix.
+    # Return a copy with the elements above the k-th diagonal filled with zero.
+    def tril(k=0)
+      dup.tril!(k)
+    end
+
+    # Lower triangular matrix.
+    # Fill the self elements above the k-th diagonal with zero.
+    def tril!(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      if contiguous?
+        idx = triu_indices(k+1)  # intended: positions strictly above the k-th diagonal
+        *shp,m,n = shape
+        reshape!(*shp,m*n)       # merge the last two axes so idx can address them
+        self[false,idx] = 0
+        reshape!(*shp,m,n)       # restore the original shape
+      else
+        store(tril(k))           # non-contiguous: compute on a copy, then write back
+      end
+    end
+
+    # Return the indices for the lower-triangle on and below the k-th diagonal.
+    def tril_indices(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      m,n = shape[-2..-1]
+      NArray.tril_indices(m,n,k)
+    end
+
+    # Return the indices for the lower-triangle on and below the k-th diagonal.
+    def self.tril_indices(m,n,k=0)
+      x = Numo::Int64.new(m,1).seq + k
+      y = Numo::Int64.new(1,n).seq
+      (x>=y).where
+    end
+
+    # Return the k-th diagonal indices.
+    def diag_indices(k=0)
+      if ndim < 2
+        raise NArray::ShapeError, "must be >= 2-dimensional array"
+      end
+      m,n = shape[-2..-1]
+      NArray.diag_indices(m,n,k)
+    end
+
+    # Return the k-th diagonal indices.
+    def self.diag_indices(m,n,k=0)
+      x = Numo::Int64.new(m,1).seq + k
+      y = Numo::Int64.new(1,n).seq
+      (x.eq y).where
+    end
+
+    # Return a matrix whose diagonal is constructed by self along the last axis.
+    def diag(k=0)
+      *shp,n = shape
+      n += k.abs
+      a = self.class.zeros(*shp,n,n)
+      a.diagonal(k).store(self)
+      a
+    end
+
+    # Return the sum along diagonals of the array.
+    #
+    # If 2-D array, computes the summation along its diagonal with the
+    # given offset, i.e., sum of `a[i,i+offset]`.
+    # If more than 2-D array, the diagonal is determined from the axes
+    # specified by axis argument. The default is axis=[-2,-1].
+    # @param offset [Integer] (optional, default=0) diagonal offset
+    # @param axis [Array] (optional, default=[-2,-1]) diagonal axis
+    # @param nan [Bool] (optional, default=false) nan-aware algorithm, i.e., if true then it ignores nan.
+
+    def trace(offset=nil,axis=nil,nan:false)
+      diagonal(offset,axis).sum(nan:nan,axis:-1)
+    end
+
+
+    @@warn_slow_dot = false
+
+    # Dot product of two arrays.
+    # @param b [Numo::NArray]
+    # @return [Numo::NArray]  return dot product
+
+    def dot(b)
+      t = self.class::UPCAST[b.class]  # class looked up in the UPCAST table for b's class
+      if defined?(Linalg) && [SFloat,DFloat,SComplex,DComplex].include?(t)
+        Linalg.dot(self,b)  # float/complex case with Linalg loaded: delegate
+      else
+        b = self.class.asarray(b)
+        case b.ndim
+        when 1
+          mulsum(b, axis:-1)  # b is 1-D: multiply and sum along the last axis
+        else
+          case ndim
+          when 0
+            b.mulsum(self, axis:-2)
+          when 1
+            self[true,:new].mulsum(b, axis:-2)  # give self a trailing axis before summing
+          else
+            unless @@warn_slow_dot  # the warning below is emitted at most once
+              nx = 200     # threshold for each of the four matrix dimensions
+              ns = 200000  # threshold for the total size of each operand
+              am,an = shape[-2..-1]
+              bm,bn = b.shape[-2..-1]
+              if am > nx && an > nx && bm > nx && bn > nx &&
+                  size > ns && b.size > ns
+                @@warn_slow_dot = true
+                warn "\nwarning: Built-in matrix dot is slow. Consider installing Numo::Linalg.\n\n"
+              end
+            end
+            self[false,:new].mulsum(b[false,:new,true,true], axis:-2)  # new axes inserted on both operands, then summed over axis -2
+          end
+        end
+      end
+    end
+
+    # Inner product of two arrays.
+    # Same as `(a*b).sum(axis:-1)`.
+    # @param b [Numo::NArray]
+    # @param axis [Integer] applied axis
+    # @return [Numo::NArray]  return (a*b).sum(axis:axis)
+
+    def inner(b, axis:-1)
+      mulsum(b, axis:axis)  # plain delegate: forwards b and the chosen axis to mulsum
+    end
+
+    # Outer product of two arrays.
+    # Same as `self[false,:new] * b[false,:new,true]`.
+    #
+    # @param b [Numo::NArray]
+    # @param axis [Integer] applied axis (default=-1)
+    # @return [Numo::NArray]  return outer product
+    # @example
+    #   a = Numo::DFloat.ones(5)
+    #   => Numo::DFloat#shape=[5]
+    #   [1, 1, 1, 1, 1]
+    #   b = Numo::DFloat.linspace(-2,2,5)
+    #   => Numo::DFloat#shape=[5]
+    #   [-2, -1, 0, 1, 2]
+    #   a.outer(b)
+    #   => Numo::DFloat#shape=[5,5]
+    #   [[-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2],
+    #    [-2, -1, 0, 1, 2]]
+
+    def outer(b, axis:nil)
+      b = NArray.cast(b)  # accept anything NArray.cast can convert
+      if axis.nil?
+        self[false,:new] * ((b.ndim==0) ? b : b[false,:new,true])  # a 0-dim b is multiplied as-is
+      else
+        md,nd = [ndim,b.ndim].minmax  # md: smaller rank, nd: larger rank
+        axis = check_axis(axis) - nd  # validated against self's rank, then made negative
+        if axis < -md
+          raise ArgumentError,"axis=#{axis} is out of range"
+        end
+        adim = [true]*ndim
+        adim[axis+ndim+1,0] = :new  # zero-length slice assignment inserts :new here
+        bdim = [true]*b.ndim
+        bdim[axis+b.ndim,0] = :new  # same insertion for b, one position earlier
+        self[*adim] * b[*bdim]
+      end
+    end
+
+    # Kronecker product of two arrays.
+    #
+    #     kron(a,b)[k_0, k_1, ...] = a[i_0, i_1, ...] * b[j_0, j_1, ...]
+    #        where:  k_n = i_n * b.shape[n] + j_n
+    #
+    # @param b [Numo::NArray]
+    # @return [Numo::NArray]  return Kronecker product
+    # @example
+    #   Numo::DFloat[1,10,100].kron([5,6,7])
+    #   => Numo::DFloat#shape=[9]
+    #   [5, 6, 7, 50, 60, 70, 500, 600, 700]
+    #   Numo::DFloat[5,6,7].kron([1,10,100])
+    #   => Numo::DFloat#shape=[9]
+    #   [5, 50, 500, 6, 60, 600, 7, 70, 700]
+    #   Numo::DFloat.eye(2).kron(Numo::DFloat.ones(2,2))
+    #   => Numo::DFloat#shape=[4,4]
+    #   [[1, 1, 0, 0],
+    #    [1, 1, 0, 0],
+    #    [0, 0, 1, 1],
+    #    [0, 0, 1, 1]]
+
+    def kron(b)
+      b = NArray.cast(b)
+      nda = ndim
+      ndb = b.ndim
+      shpa = shape
+      shpb = b.shape
+      adim = [:new]*(2*[ndb-nda,0].max) + [true,:new]*nda
+      bdim = [:new]*(2*[nda-ndb,0].max) + [:new,true]*ndb
+      shpr = (-[nda,ndb].max..-1).map{|i| (shpa[i]||1) * (shpb[i]||1)}
+      (self[*adim] * b[*bdim]).reshape(*shpr)
+    end
+
+
+    # Covariance computed along the last axis; marked "under construction" by the author.
+    def cov(y=nil, ddof:1, fweights:nil, aweights:nil)
+      if y
+        m = NArray.vstack([self,y])  # combine self and y into one array via vstack
+      else
+        m = self
+      end
+      w = nil  # combined weights; stays nil when neither weight argument is given
+      if fweights
+        f = fweights
+        w = f
+      end
+      if aweights
+        a = aweights
+        w = w ? w*a : a  # product of both weights when fweights was also given
+      end
+      if w
+        w_sum = w.sum(axis:-1, keepdims:true)
+        if ddof == 0
+          fact = w_sum
+        elsif aweights.nil?
+          fact = w_sum - ddof
+        else
+          wa_sum = (w*a).sum(axis:-1, keepdims:true)  # `a` is set here: aweights is non-nil
+          fact = w_sum - ddof * wa_sum / w_sum
+        end
+        if (fact <= 0).any?
+          raise StandardError,"Degrees of freedom <= 0 for slice"
+        end
+      else
+        fact = m.shape[-1] - ddof  # unweighted: length of the last axis minus ddof
+      end
+      if w
+        m -= (m*w).sum(axis:-1, keepdims:true) / w_sum  # subtract the weighted mean
+        mw = m*w
+      else
+        m -= m.mean(axis:-1, keepdims:true)  # subtract the mean along the last axis
+        mw = m
+      end
+      mt = (m.ndim < 2) ? m : m.swapaxes(-2,-1)  # swap the last two axes when there are two
+      mw.dot(mt.conj) / fact
+    end
+
+    private
+
+    # @!visibility private
+    def check_axis(axis)
+      unless Integer===axis
+        raise ArgumentError,"axis=#{axis} must be Integer"
+      end
+      a = axis
+      if a < 0
+        a += ndim
+      end
+      if a < 0 || a >= ndim
+        raise ArgumentError,"axis=#{axis} is invalid"
+      end
+      a
+    end
+
+  end
+end
diff --git a/numo-narray.gemspec b/numo-narray.gemspec
new file mode 100644
index 0000000..6357438
--- /dev/null
+++ b/numo-narray.gemspec
@@ -0,0 +1,36 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+open("ext/numo/narray/numo/narray.h") do |f|  # relative path: resolved against the current directory
+  f.each_line do |l|
+    if /NARRAY_VERSION "([\d.]+)"/ =~ l  # matches e.g. NARRAY_VERSION "0.9.0.7"
+      NARRAY_VERSION = $1  # the digits-and-dots text captured by the regex
+      break  # stop scanning at the first match
+    end
+  end
+end
+
+Gem::Specification.new do |spec|
+  spec.name          = "numo-narray"
+  spec.version       = NARRAY_VERSION  # constant assigned just above from narray.h
+  spec.authors       = ["Masahiro TANAKA"]
+  spec.email         = ["masa16.tanaka at gmail.com"]
+  spec.description   = %q{Numo::NArray - New NArray class library in Ruby/Numo.}
+  spec.summary       = %q{alpha release of Numo::NArray - New NArray class library in Ruby/Numo (NUmerical MOdule)}
+  spec.homepage      = "https://github.com/ruby-numo/narray"
+  spec.license       = "BSD-3-Clause"
+  spec.required_ruby_version = '~> 2.1'  # i.e. >= 2.1 and < 3.0
+
+  # The file list is taken from `git ls-files`, so it depends on running inside a git checkout.
+  spec.files         = `git ls-files Gemfile README.md Rakefile lib ext numo-narray.gemspec spec`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.extensions    = ["ext/numo/narray/extconf.rb"]  # extension build script
+
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake", "~> 0"
+  spec.add_development_dependency "rspec", "~> 3"
+  spec.add_development_dependency 'rake-compiler', "~> 1.0", ">= 1.0.1"
+  spec.add_development_dependency "rake-compiler-dock", "~> 0"
+end
diff --git a/spec/bit_spec.rb b/spec/bit_spec.rb
new file mode 100644
index 0000000..bc36863
--- /dev/null
+++ b/spec/bit_spec.rb
@@ -0,0 +1,93 @@
+require File.join(File.dirname(__FILE__), "../ext/numo/narray/narray")
+#Numo::NArray.debug = true
+
+RSpec.configure do |config|
+  config.filter_run :focus
+  config.run_all_when_everything_filtered = true
+end
+#context :focus=>true do ... end
+
+dtype = Numo::Bit
+
+describe dtype do
+  it{expect(dtype).to be < Numo::NArray}
+end
+
+procs = [
+  [proc{|tp,a| tp[*a] },""],
+  [proc{|tp,a| tp[*a][true] },"[true]"],
+  [proc{|tp,a| tp[*a][0..-1] },"[0..-1]"]
+]
+procs.each do |init,ref|
+
+  describe dtype,"[0,1,1,0,1,0,0,1]"+ref do
+    before(:all) do
+      @src = [0,1,1,0,1,0,0,1]
+      @n = @src.size
+      @a = init.call(dtype, at src)
+    end
+
+    it{expect(@a).to eq @src}
+    it{expect(@a & 0).to eq [0]*@n}
+    it{expect(@a & 1).to eq @src}
+    it{expect(@a | 0).to eq @src}
+    it{expect(@a | 1).to eq [1]*@n}
+    it{expect(@a ^ 0).to eq @src.map{|x| x^0}}
+    it{expect(@a ^ 1).to eq @src.map{|x| x^1}}
+    it{expect(~@a).to eq @src.map{|x| 1-x}}
+
+    it{expect(@a.count_true).to eq 4}
+    it{expect(@a.count_false).to eq 4}
+    it{expect(@a.where).to eq [1,2,4,7]}
+    it{expect(@a.where2).to eq [[1,2,4,7],[0,3,5,6]]}
+    it{expect(@a.mask(Numo::DFloat[1,2,3,4,5,6,7,8])).to eq [2,3,5,8]}
+    it{expect(@a).not_to be_all}
+    it{expect(@a).to     be_any}
+    it{expect(@a).not_to be_none}
+
+    after(:all) do
+      @a = nil
+    end
+  end
+
+end
+
+procs = [
+  [proc{|tp,a| tp[*a] },""],
+  [proc{|tp,a| tp[*a][true,0..-1] },"[true,true]"],
+]
+procs.each do |init,ref|
+
+  describe dtype,"[[0,1,1,0],[1,0,0,1]]"+ref do
+    before(:all) do
+      @src = [[0,1,1,0],[1,0,0,1]]
+      @n = @src.size
+      @a = init.call(dtype, at src)
+    end
+
+    it{expect(@a[5]).to eq 0}
+    it{expect(@a[-1]).to eq 1}
+    it{expect(@a[1,0]).to eq @src[1][0]}
+    it{expect(@a[1,1]).to eq @src[1][1]}
+    it{expect(@a[1,2]).to eq @src[1][2]}
+    it{expect(@a[3..4]).to eq [0,1]}
+    it{expect(@a[0,1..2]).to eq [1,1]}
+    it{expect(@a[0,:*]).to eq @src[0]}
+    it{expect(@a[1,:*]).to eq @src[1]}
+    it{expect(@a[:*,1]).to eq [@src[0][1],@src[1][1]]}
+
+    it{expect(@a.count_true).to eq 4}
+    it{expect(@a.count_false).to eq 4}
+    it{expect(@a.where).to eq [1,2,4,7]}
+    it{expect(@a.where2).to eq [[1,2,4,7],[0,3,5,6]]}
+    it{expect(@a.mask(Numo::DFloat[[1,2,3,4],[5,6,7,8]])).to eq [2,3,5,8]}
+    it{expect(@a).not_to be_all}
+    it{expect(@a).to     be_any}
+    it{expect(@a).not_to be_none}
+
+    after(:all) do
+      @a = nil
+    end
+  end
+
+end
diff --git a/spec/narray_spec.rb b/spec/narray_spec.rb
new file mode 100644
index 0000000..6fec60e
--- /dev/null
+++ b/spec/narray_spec.rb
@@ -0,0 +1,250 @@
+require File.join(File.dirname(__FILE__), "../ext/numo/narray/narray")
+#Numo::NArray.debug = true
+
+RSpec.configure do |config|
+  config.filter_run :focus
+  config.run_all_when_everything_filtered = true
+end
+#context :focus=>true do ... end
+
+types = [
+  Numo::DFloat,
+  Numo::SFloat,
+  Numo::DComplex,
+  Numo::SComplex,
+  Numo::Int64,
+  Numo::Int32,
+  Numo::Int16,
+  Numo::Int8,
+  Numo::UInt64,
+  Numo::UInt32,
+  Numo::UInt16,
+  Numo::UInt8,
+]
+#types = [Numo::DFloat]
+float_types = [
+  Numo::DFloat,
+  Numo::DComplex,
+]
+
+types.each do |dtype|
+
+  describe dtype  do
+    it{expect(dtype).to be < Numo::NArray}
+  end
+
+  procs = [
+    [proc{|tp,a| tp[*a] },""],
+    [proc{|tp,a| tp[*a][true] },"[true]"],
+    [proc{|tp,a| tp[*a][0..-1] },"[0..-1]"]
+  ]
+  procs.each do |init,ref|
+
+    describe dtype,"[1,2,3,5,7,11]"+ref do
+      before(:all) do
+        @src = [1,2,3,5,7,11]
+        @a = init.call(dtype, at src)
+      end
+      #context :focus=>true do
+
+      it{expect(@a).to be_kind_of dtype}
+      it{expect(@a.size).to eq 6}
+      it{expect(@a.ndim).to eq 1}
+      it{expect(@a.shape).to eq [6]}
+      it{expect(@a).not_to be_inplace}
+      it{expect(@a).to     be_row_major}
+      it{expect(@a).not_to be_column_major}
+      it{expect(@a).to     be_host_order}
+      it{expect(@a).not_to be_byte_swapped}
+      it{expect(@a).to eq [1,2,3,5,7,11]}
+      it{expect(@a.to_a).to eq [1,2,3,5,7,11]}
+      it{expect(@a.to_a).to be_kind_of Array}
+      it{expect(@a.dup).to eq @a}
+      it{expect(@a.clone).to eq @a}
+      it{expect(@a.dup.object_id).not_to eq @a.object_id}
+      it{expect(@a.clone.object_id).not_to eq @a.object_id}
+
+      it{expect(@a.eq([1,1,3,3,7,7])).to eq [1,0,1,0,1,0]}
+      it{expect(@a[3..4]).to eq [5,7]}
+      it{expect(@a[5]).to eq 11}
+      it{expect(@a[-1]).to eq 11}
+      it{expect(@a[[4,3,0,1,5,2]]).to eq [7,5,1,2,11,3]}
+      it{expect(@a.sum).to eq 29}
+      if float_types.include?(dtype)
+        it{expect(@a.mean).to eq 29.0/6}
+        it{expect(@a.var).to eq 13.766666666666669}
+        it{expect(@a.stddev).to eq 3.710345895825168}
+        it{expect(@a.rms).to eq 5.901977069875258}
+      end
+      it{expect(@a.dup.fill(12)).to eq [12]*6}
+      it{expect((@a + 1)).to eq [2,3,4,6,8,12]}
+      it{expect((@a - 1)).to eq [0,1,2,4,6,10]}
+      it{expect((@a * 3)).to eq [3,6,9,15,21,33]}
+      it{expect((@a / 0.5)).to eq [2,4,6,10,14,22]}
+      it{expect((-@a)).to eq [-1,-2,-3,-5,-7,-11]}
+      it{expect((@a ** 2)).to eq [1,4,9,25,49,121]}
+      it{expect(@a.swap_byte.swap_byte).to eq [1,2,3,5,7,11]}
+      if dtype == Numo::DComplex || dtype == Numo::SComplex
+        it{expect(@a.real).to eq @src}
+        it{expect(@a.imag).to eq [0]*6}
+        it{expect(@a.conj).to eq @src}
+        it{expect(@a.angle).to eq [0]*6}
+      else
+        it{expect(@a.min).to eq 1}
+        it{expect(@a.max).to eq 11}
+        it{expect((@a >= 3)).to eq [0,0,1,1,1,1]}
+        it{expect((@a >  3)).to eq [0,0,0,1,1,1]}
+        it{expect((@a <= 3)).to eq [1,1,1,0,0,0]}
+        it{expect((@a <  3)).to eq [1,1,0,0,0,0]}
+        it{expect((@a.eq 3)).to eq [0,0,1,0,0,0]}
+        it{expect(@a.sort).to eq @src}
+        it{expect(@a.sort_index).to eq (0..5).to_a}
+        it{expect(@a.median).to eq 4}
+      end
+    end
+  end
+
+  describe dtype, '[1..4]' do
+    it{expect(dtype[1..4]).to eq [1,2,3,4]}
+  end
+
+  #describe dtype, ".seq(5)" do
+  #  it do
+  #    dtype.seq(5).should == [0,1,2,3,4]
+  #  end
+  #end
+
+  procs2 = [
+    [proc{|tp,src| tp[*src] },""],
+    [proc{|tp,src| tp[*src][true,true] },"[true,true]"],
+    [proc{|tp,src| tp[*src][0..-1,0..-1] },"[0..-1,0..-1]"]
+  ]
+  procs2.each do |init,ref|
+
+    describe dtype,'[[1,2,3],[5,7,11]]'+ref do
+      before(:all) do
+        @src = [[1,2,3],[5,7,11]]
+        @a = init.call(dtype, at src)
+      end
+      #context :focus=>true do
+
+      it{expect(@a).to be_kind_of dtype}
+      it{expect(@a.size).to eq 6}
+      it{expect(@a.ndim).to eq 2}
+      it{expect(@a.shape).to eq [2,3]}
+      it{expect(@a).not_to be_inplace}
+      it{expect(@a).to     be_row_major}
+      it{expect(@a).not_to be_column_major}
+      it{expect(@a).to     be_host_order}
+      it{expect(@a).not_to be_byte_swapped}
+      it{expect(@a).to eq @src}
+      it{expect(@a.to_a).to eq @src}
+      it{expect(@a.to_a).to be_kind_of Array}
+
+      it{expect(@a.eq([[1,1,3],[3,7,7]])).to eq [[1,0,1],[0,1,0]]}
+      it{expect(@a[5]).to eq 11}
+      it{expect(@a[-1]).to eq 11}
+      it{expect(@a[1,0]).to eq @src[1][0]}
+      it{expect(@a[1,1]).to eq @src[1][1]}
+      it{expect(@a[1,2]).to eq @src[1][2]}
+      it{expect(@a[3..4]).to eq [5,7]}
+      it{expect(@a[0,1..2]).to eq [2,3]}
+      it{expect(@a[0,:*]).to eq @src[0]}
+      it{expect(@a[1,:*]).to eq @src[1]}
+      it{expect(@a[:*,1]).to eq [@src[0][1],@src[1][1]]}
+      it{expect(@a[true,[2,0,1]]).to eq [[3,1,2],[11,5,7]]}
+      it{expect(@a.reshape(3,2)).to eq [[1,2],[3,5],[7,11]]}
+      it{expect(@a.reshape(3,nil)).to eq [[1,2],[3,5],[7,11]]}
+      it{expect(@a.reshape(nil,2)).to eq [[1,2],[3,5],[7,11]]}
+      it{expect(@a.transpose).to eq [[1,5],[2,7],[3,11]]}
+      it{expect(@a.transpose(1,0)).to eq [[1,5],[2,7],[3,11]]}
+
+      it{expect(@a.sum).to eq 29}
+      it{expect(@a.sum(0)).to eq [6, 9, 14]}
+      it{expect(@a.sum(1)).to eq [6, 23]}
+      if float_types.include?(dtype)
+        it{expect(@a.mean).to eq 29.0/6}
+        it{expect(@a.mean(0)).to eq [3, 4.5, 7]}
+        it{expect(@a.mean(1)).to eq [2, 23.0/3]}
+      end
+      if dtype == Numo::DComplex || dtype == Numo::SComplex
+        it{expect(@a.real).to eq @src}
+        it{expect(@a.imag).to eq [[0]*3]*2}
+        it{expect(@a.conj).to eq @src}
+        it{expect(@a.angle).to eq [[0]*3]*2}
+      else
+        it{expect(@a.min).to eq 1}
+        it{expect(@a.max).to eq 11}
+        it{expect((@a >= 3)).to eq [[0,0,1],[1,1,1]]}
+        it{expect((@a >  3)).to eq [[0,0,0],[1,1,1]]}
+        it{expect((@a <= 3)).to eq [[1,1,1],[0,0,0]]}
+        it{expect((@a <  3)).to eq [[1,1,0],[0,0,0]]}
+        it{expect((@a.eq 3)).to eq [[0,0,1],[0,0,0]]}
+        it{expect(@a.sort).to eq @src}
+        it{expect(@a.sort_index).to eq [[0,1,2],[3,4,5]]}
+      end
+      it{expect(@a.dup.fill(12)).to eq [[12]*3]*2}
+      it{expect((@a + 1)).to eq [[2,3,4],[6,8,12]]}
+      it{expect((@a + [1,2,3])).to eq [[2,4,6],[6,9,14]]}
+      it{expect((@a - 1)).to eq [[0,1,2],[4,6,10]]}
+      it{expect((@a - [1,2,3])).to eq [[0,0,0],[4,5,8]]}
+      it{expect((@a * 3)).to eq [[3,6,9],[15,21,33]]}
+      it{expect((@a * [1,2,3])).to eq [[1,4,9],[5,14,33]]}
+      it{expect((@a / 0.5)).to eq [[2,4,6],[10,14,22]]}
+      it{expect((-@a)).to eq [[-1,-2,-3],[-5,-7,-11]]}
+      it{expect((@a ** 2)).to eq [[1,4,9],[25,49,121]]}
+      it{expect((dtype[[1,0],[0,1]].dot dtype[[4,1],[2,2]])).to eq [[4,1],[2,2]]}
+      it{expect(@a.swap_byte.swap_byte).to eq @src}
+    end
+
+  end
+
+  describe dtype,"[[[1,2],[3,4]],[[5,6],[7,8]]]" do
+    before do
+      @a = dtype[[[1,2],[3,4]],[[5,6],[7,8]]]
+    end
+
+    it{expect(@a[0, 1, 1]).to eq 4}
+    it{expect(@a[:rest]).to eq @a}
+    it{expect(@a[0, :rest]).to eq [[1,2],[3,4]]}
+    it{expect(@a[0, false]).to eq [[1,2],[3,4]]}
+    it{expect(@a[0, 1, :rest]).to eq [3,4]}
+    it{expect(@a[0, 1, false]).to eq [3,4]}
+    it{expect(@a[:rest, 0]).to eq [[1,3],[5,7]]}
+    it{expect(@a[:rest, 0, 1]).to eq [2,6]}
+    it{expect(@a[1, :rest, 0]).to eq [5,7]}
+    it{expect(@a[1, 1, :rest, 0]).to eq 7}
+    it{expect{@a[1, 1, 1, 1, :rest]}.to raise_error IndexError}
+    it{expect{@a[1, 1, 1, :rest, 1]}.to raise_error IndexError}
+    it{expect{@a[:rest, 1, :rest, 0]}.to raise_error IndexError}
+  end
+
+  describe dtype, "#dot" do
+    it "vector.dot(vector)" do
+      a = dtype[1..3]
+      b = dtype[2..4]
+      expect(a.dot(b)).to eq (1*2 + 2*3 + 3*4)
+    end
+    it "matrix.dot(vector)" do
+      a = dtype[1..6].reshape(3,2)
+      b = dtype[1..2]
+      expect(a.dot(b)).to eq [5, 11, 17]
+    end
+    it "vector.dot(matrix)" do
+      a = dtype[1..2]
+      b = dtype[1..6].reshape(2,3)
+      expect(a.dot(b)).to eq [9, 12, 15]
+    end
+    it "matrix.dot(matrix)" do
+      a = dtype[1..6].reshape(3,2)
+      b = dtype[1..6].reshape(2,3)
+      expect(a.dot(b)).to eq [[9, 12, 15], [19, 26, 33], [29, 40, 51]]
+      expect(b.dot(a)).to eq [[22, 28], [49, 64]]
+    end
+    it "matrix.dot(matrix) with incorrect shape" do
+      a = dtype[1..6].reshape(3,2)
+      b = dtype[1..9].reshape(3,3)
+      expect{a.dot(b)}.to raise_error(Numo::NArray::ShapeError)
+    end
+  end
+end

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ruby-extras/ruby-numo-narray.git