[h5py] 104/455: Automatic chunking for Dataset constructor

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jul 2 18:19:22 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to annotated tag 1.3.0
in repository h5py.

commit db54a694c470048a500126e3e71644fe6d64190d
Author: andrewcollette <andrew.collette at gmail.com>
Date:   Thu Aug 21 03:54:12 2008 +0000

    Automatic chunking for Dataset constructor
---
 h5py/highlevel.py            | 20 ++++++++++++++-----
 h5py/tests/test_highlevel.py |  6 +++++-
 h5py/utils_hl.py             | 46 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/h5py/highlevel.py b/h5py/highlevel.py
index 1b987e3..13d49a6 100644
--- a/h5py/highlevel.py
+++ b/h5py/highlevel.py
@@ -53,7 +53,8 @@ import threading
 
 from h5py import h5, h5f, h5g, h5s, h5t, h5d, h5a, h5p, h5z, h5i, config
 from h5py.h5 import H5Error
-from utils_hl import slice_select, hbasename, strhdr, strlist, FlatIndexer
+from utils_hl import slice_select, hbasename, strhdr, strlist, FlatIndexer, \
+                     guess_chunk
 from browse import _H5Browser
 
 
@@ -524,7 +525,7 @@ class Dataset(HLObject):
             exists.  Also, chunks/compression/shuffle/fletcher32 may only be
             specified when creating a dataset.
 
-            Creation keywords (* is default); "chunks" is required for all:
+            Creation keywords (* is default):
 
             chunks:        Tuple of chunk dimensions or None*
             compression:   DEFLATE (gzip) compression level, int or None*
@@ -533,8 +534,11 @@ class Dataset(HLObject):
 
             maxshape:      Tuple giving dataset maximum dimensions.  You can
                            grow each axis up to this limit using extend(). For
-                           an unlimited axis, provide None.  Can only be used
-                           with chunks.
+                           an unlimited axis, provide None.  Requires chunks.
+
+            All these options require chunking.  If a chunk tuple is not
+            provided, the constructor will guess an appropriate chunk shape.
+            Please note none of these are allowed for scalar datasets.
         """
         with group._lock:
             if data is None and shape is None:
@@ -555,8 +559,14 @@ class Dataset(HLObject):
                 
                 dtype = numpy.dtype(dtype)
 
+                if any((compression, shuffle, fletcher32, maxshape)) and chunks is None:
+                    chunks = guess_chunk(shape, dtype.itemsize)
+
+                if chunks is not None and shape == ():
+                    raise ValueError("Filter options cannot be used with scalar datasets.")
+
                 plist = h5p.create(h5p.DATASET_CREATE)
-                if chunks:
+                if chunks is not None:
                     plist.set_chunk(chunks)
                 if shuffle:
                     plist.set_shuffle()
diff --git a/h5py/tests/test_highlevel.py b/h5py/tests/test_highlevel.py
index 5236d47..59f3cfd 100644
--- a/h5py/tests/test_highlevel.py
+++ b/h5py/tests/test_highlevel.py
@@ -170,6 +170,10 @@ class TestDataset(unittest.TestCase):
         shapes = [(), (1,), (10,5), (1,10), (10,1), (100,1,100), (51,2,1025)]
         chunks = [None, (1,), (10,1), (1,1),  (1,1),  (50,1,100), (51,2,25)]
 
+        # Test auto-chunk creation for each
+        shapes += shapes
+        chunks += [None]*len(chunks)
+
         for shape, chunk in zip(shapes, chunks):
             for dt in TYPES:
                 print "    Creating %.20s %.40s" % (shape, dt)
@@ -179,7 +183,7 @@ class TestDataset(unittest.TestCase):
                 self.assertEqual(d.dtype, dt)
                 del self.f["NewDataset"]
 
-                if chunk is not None:
+                if shape != ():
                     print "        With chunk %s" % (chunk,)
                     d = Dataset(self.f, "NewDataset", dtype=dt, shape=shape,
                                 chunks=chunk, shuffle=True, compression=6,
diff --git a/h5py/utils_hl.py b/h5py/utils_hl.py
index 7011daa..b7c56bb 100644
--- a/h5py/utils_hl.py
+++ b/h5py/utils_hl.py
@@ -8,6 +8,10 @@ from h5py import h5s
 from posixpath import basename, normpath
 import numpy
 
+CHUNK_BASE = 16*1024    # Multiplier by which chunks are adjusted
+MIN_CHUNK = 8*1024      # Soft lower limit (8k)
+MAX_CHUNK = 1024*1024   # Hard upper limit (1M)
+
 def hbasename(name):
     """ Basename function with more readable handling of trailing slashes"""
     bname = normpath(name)
@@ -16,6 +20,48 @@ def hbasename(name):
         bname = '/'
     return bname
 
+def guess_chunk(shape, typesize):
+    """ Guess an appropriate chunk layout for a dataset, given its shape and
+        the size of each element in bytes.  Will allocate chunks only as large
+        as MAX_CHUNK.  Chunks are generally close to some power-of-2 fraction of
+        each axis, slightly favoring bigger values for the last index.
+    """
+
+    ndims = len(shape)
+    if ndims == 0:
+        raise ValueError("Chunks not allowed for scalar datasets.")
+
+    chunks = numpy.array(shape, dtype='=f8')
+
+    # Determine the optimal chunk size in bytes using a PyTables expression.
+    # This is kept as a float.
+    dset_size = numpy.product(chunks)*typesize
+    target_size = CHUNK_BASE * (2**numpy.log10(dset_size/(1024.*1024)))
+
+    if target_size > MAX_CHUNK:
+        target_size = MAX_CHUNK
+    elif target_size < MIN_CHUNK:
+        target_size = MIN_CHUNK
+
+    idx = 0
+    while True:
+        # Repeatedly loop over the axes, dividing them by 2.  Stop when:
+        # 1a. We're smaller than the target chunk size, OR
+        # 1b. We're within 50% of the target chunk size, AND
+        #  2. The chunk is smaller than the maximum chunk size
+
+        chunk_bytes = numpy.product(chunks)*typesize
+
+        if (chunk_bytes < target_size or \
+         abs(chunk_bytes-target_size)/target_size < 0.5) and \
+         chunk_bytes < MAX_CHUNK:
+            break
+
+        chunks[idx%ndims] = numpy.ceil(chunks[idx%ndims] / 2.0)
+        idx += 1
+
+    return tuple(long(x) for x in chunks)
+
 class FlatIndexer(object):
 
     """

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/h5py.git



More information about the debian-science-commits mailing list