[h5py] 104/455: Automatic chunking for Dataset constructor
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jul 2 18:19:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to annotated tag 1.3.0
in repository h5py.
commit db54a694c470048a500126e3e71644fe6d64190d
Author: andrewcollette <andrew.collette at gmail.com>
Date: Thu Aug 21 03:54:12 2008 +0000
Automatic chunking for Dataset constructor
---
h5py/highlevel.py | 20 ++++++++++++++-----
h5py/tests/test_highlevel.py | 6 +++++-
h5py/utils_hl.py | 46 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 66 insertions(+), 6 deletions(-)
diff --git a/h5py/highlevel.py b/h5py/highlevel.py
index 1b987e3..13d49a6 100644
--- a/h5py/highlevel.py
+++ b/h5py/highlevel.py
@@ -53,7 +53,8 @@ import threading
from h5py import h5, h5f, h5g, h5s, h5t, h5d, h5a, h5p, h5z, h5i, config
from h5py.h5 import H5Error
-from utils_hl import slice_select, hbasename, strhdr, strlist, FlatIndexer
+from utils_hl import slice_select, hbasename, strhdr, strlist, FlatIndexer, \
+ guess_chunk
from browse import _H5Browser
@@ -524,7 +525,7 @@ class Dataset(HLObject):
exists. Also, chunks/compression/shuffle/fletcher32 may only be
specified when creating a dataset.
- Creation keywords (* is default); "chunks" is required for all:
+ Creation keywords (* is default):
chunks: Tuple of chunk dimensions or None*
compression: DEFLATE (gzip) compression level, int or None*
@@ -533,8 +534,11 @@ class Dataset(HLObject):
maxshape: Tuple giving dataset maximum dimensions. You can
grow each axis up to this limit using extend(). For
- an unlimited axis, provide None. Can only be used
- with chunks.
+ an unlimited axis, provide None. Requires chunks.
+
+ All these options require chunking. If a chunk tuple is not
+ provided, the constructor will guess an appropriate chunk shape.
+ Please note that none of these options are allowed for scalar datasets.
"""
with group._lock:
if data is None and shape is None:
@@ -555,8 +559,14 @@ class Dataset(HLObject):
dtype = numpy.dtype(dtype)
+ if any((compression, shuffle, fletcher32, maxshape)) and chunks is None:
+ chunks = guess_chunk(shape, dtype.itemsize)
+
+ if chunks is not None and shape == ():
+ raise ValueError("Filter options cannot be used with scalar datasets.")
+
plist = h5p.create(h5p.DATASET_CREATE)
- if chunks:
+ if chunks is not None:
plist.set_chunk(chunks)
if shuffle:
plist.set_shuffle()
diff --git a/h5py/tests/test_highlevel.py b/h5py/tests/test_highlevel.py
index 5236d47..59f3cfd 100644
--- a/h5py/tests/test_highlevel.py
+++ b/h5py/tests/test_highlevel.py
@@ -170,6 +170,10 @@ class TestDataset(unittest.TestCase):
shapes = [(), (1,), (10,5), (1,10), (10,1), (100,1,100), (51,2,1025)]
chunks = [None, (1,), (10,1), (1,1), (1,1), (50,1,100), (51,2,25)]
+ # Test auto-chunk creation for each
+ shapes += shapes
+ chunks += [None]*len(chunks)
+
for shape, chunk in zip(shapes, chunks):
for dt in TYPES:
print " Creating %.20s %.40s" % (shape, dt)
@@ -179,7 +183,7 @@ class TestDataset(unittest.TestCase):
self.assertEqual(d.dtype, dt)
del self.f["NewDataset"]
- if chunk is not None:
+ if shape != ():
print " With chunk %s" % (chunk,)
d = Dataset(self.f, "NewDataset", dtype=dt, shape=shape,
chunks=chunk, shuffle=True, compression=6,
diff --git a/h5py/utils_hl.py b/h5py/utils_hl.py
index 7011daa..b7c56bb 100644
--- a/h5py/utils_hl.py
+++ b/h5py/utils_hl.py
@@ -8,6 +8,10 @@ from h5py import h5s
from posixpath import basename, normpath
import numpy
+CHUNK_BASE = 16*1024 # Multiplier by which chunks are adjusted
+MIN_CHUNK = 8*1024 # Soft lower limit (8k)
+MAX_CHUNK = 1024*1024 # Hard upper limit (1M)
+
def hbasename(name):
""" Basename function with more readable handling of trailing slashes"""
bname = normpath(name)
@@ -16,6 +20,48 @@ def hbasename(name):
bname = '/'
return bname
+def guess_chunk(shape, typesize):
+ """ Guess an appropriate chunk layout for a dataset, given its shape and
+ the size of each element in bytes. Will allocate chunks only as large
+ as MAX_CHUNK. Chunks are generally close to some power-of-2 fraction of
+ each axis, slightly favoring bigger values for the last index.
+ """
+
+ ndims = len(shape)
+ if ndims == 0:
+ raise ValueError("Chunks not allowed for scalar datasets.")
+
+ chunks = numpy.array(shape, dtype='=f8')
+
+ # Determine the optimal chunk size in bytes using a PyTables expression.
+ # This is kept as a float.
+ dset_size = numpy.product(chunks)*typesize
+ target_size = CHUNK_BASE * (2**numpy.log10(dset_size/(1024.*1024)))
+
+ if target_size > MAX_CHUNK:
+ target_size = MAX_CHUNK
+ elif target_size < MIN_CHUNK:
+ target_size = MIN_CHUNK
+
+ idx = 0
+ while True:
+ # Repeatedly loop over the axes, dividing them by 2. Stop when:
+ # 1a. We're smaller than the target chunk size, OR
+ # 1b. We're within 50% of the target chunk size, AND
+ # 2. The chunk is smaller than the maximum chunk size
+
+ chunk_bytes = numpy.product(chunks)*typesize
+
+ if (chunk_bytes < target_size or \
+ abs(chunk_bytes-target_size)/target_size < 0.5) and \
+ chunk_bytes < MAX_CHUNK:
+ break
+
+ chunks[idx%ndims] = numpy.ceil(chunks[idx%ndims] / 2.0)
+ idx += 1
+
+ return tuple(long(x) for x in chunks)
+
class FlatIndexer(object):
"""
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/h5py.git
More information about the debian-science-commits
mailing list