[h5py] 103/455: Flat indexing, dataset len/iter, datasets can now be extended
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jul 2 18:19:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to annotated tag 1.3.0
in repository h5py.
commit b60dd48111c222217200b42d19334099b6b5b379
Author: andrewcollette <andrew.collette at gmail.com>
Date: Wed Aug 20 03:39:49 2008 +0000
Flat indexing, dataset len/iter, datasets can now be extended
---
h5py/highlevel.py | 110 ++++++++++++++++++++++++++++++++++++++++---
h5py/tests/test_highlevel.py | 105 ++++++++++++++++++++++++++++++++++++++++-
h5py/utils_hl.py | 37 ++++++++++++++-
3 files changed, 242 insertions(+), 10 deletions(-)
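For orientation, a minimal usage sketch of the three features this commit adds, written against the high-level API shown in the diff below. The import path, file name, and array contents are illustrative assumptions, not part of the commit:

    import numpy
    from h5py.highlevel import File   # assumed import path for this tree

    f = File('demo.hdf5', 'w')

    # Extendable dataset: maxshape axes given as None are unlimited.
    # Growth requires chunked storage, hence the chunks argument.
    ds = f.create_dataset('ds', data=numpy.arange(100).reshape((10, 10)),
                          maxshape=(None, 10), chunks=(10, 10))
    ds.extend((20, 10))       # grow the first axis, within maxshape

    print len(ds)             # 20: the size of the first axis
    for row in ds:            # iterate over the first axis
        pass

    print ds.flat[5:15]       # 1-D (C-contiguous) read across a row boundary
    ds.flat[0] = 42           # 1-D write; same element as ds[0, 0]

    f.close()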
diff --git a/h5py/highlevel.py b/h5py/highlevel.py
index c8b773d..1b987e3 100644
--- a/h5py/highlevel.py
+++ b/h5py/highlevel.py
@@ -53,7 +53,7 @@ import threading
from h5py import h5, h5f, h5g, h5s, h5t, h5d, h5a, h5p, h5z, h5i, config
from h5py.h5 import H5Error
-from utils_hl import slice_select, hbasename, strhdr, strlist
+from utils_hl import slice_select, hbasename, strhdr, strlist, FlatIndexer
from browse import _H5Browser
@@ -406,6 +406,63 @@ class File(Group):
readline.add_history(x)
self._path = browser.path
+class FlatIndexProxy(object):
+
+ """
+ Utility class which allows 1-D indexing of datasets.
+
+ These come attached to Dataset objects as <obj>.flat. They behave
+ like 1-D arrays; you can slice into them and assign to slices like
+ NumPy flatiter objects. However, they are not iterable.
+
+ In addition to single indices and slices, you can also provide an
+ iterable which yields indices and slices. The returned array will
+ be the union of these selections, in the order they were presented,
+ with duplicate entries skipped.
+
+ Examples: (let dset be of shape (10,10))
+ >>> dset.flat[10] # Equivalent to dset[1,0]
+ >>> dset.flat[5:15] # Note you can't do this with dset[x,y]
+ >>> dset.flat[0,1,3,2] # First 4 elements, in the specified order
+
+ Caveats: At the HDF5 level, this works by explicitly listing the set
+ of points to be accessed. For large, regularly strided selections,
+ you should use the standard n-D slicing syntax, which is significantly
+ faster.
+ """
+
+ def __init__(self, dset):
+ self._dset = dset
+
+ def __getitem__(self, args):
+ """ Read from the dataset, treating it as a 1-D (C-contiguous) array.
+
+ Allowed slicing mechanisms:
+ 1. Ints/longs
+ 2. Extended slices
+ 3. Sequences of ints/extended slices (e.g. flat[0,1,2])
+
+ Subsets which result in a single element are returned as scalars.
+ """
+ indexer = FlatIndexer(self._dset.shape, args)
+ arr = self._dset[indexer]
+
+ # These match the way NumPy behaves
+ if arr.shape == ():
+ return numpy.asscalar(arr)
+ return arr.newbyteorder('=')
+
+ def __setitem__(self, args, val):
+ """ Write to the dataset, treating it as a 1-D (C-contiguous) array.
+
+ Allowed slicing mechanisms:
+ 1. Ints/longs
+ 2. Extended slices
+ 3. Sequences of ints/extended slices (e.g. flat[0,1,2])
+ """
+ indexer = FlatIndexer(self._dset.shape, args)
+ self._dset[indexer] = val
+
class Dataset(HLObject):
""" High-level interface to an HDF5 dataset.
@@ -430,6 +487,9 @@ class Dataset(HLObject):
dtype = property(lambda self: self.id.dtype,
doc = "Numpy dtype representing the datatype")
+ flat = property(lambda self: FlatIndexProxy(self),
+ doc = "1-D read/write slicing access to the dataset. Not iterable.")
+
def _getval(self):
with self._lock:
arr = self[...]
@@ -442,7 +502,8 @@ class Dataset(HLObject):
def __init__(self, group, name,
shape=None, dtype=None, data=None,
- chunks=None, compression=None, shuffle=False, fletcher32=False):
+ chunks=None, compression=None, shuffle=False,
+ fletcher32=False, maxshape=None):
""" Construct a Dataset object. You might find it easier to use the
Group methods: Group["name"] or Group.create_dataset().
@@ -469,6 +530,11 @@ class Dataset(HLObject):
compression: DEFLATE (gzip) compression level, int or None*
shuffle: Use the shuffle filter? (requires compression) T/F*
fletcher32: Enable Fletcher32 error detection? T/F*
+
+ maxshape: Tuple giving dataset maximum dimensions. You can
+ grow each axis up to this limit using extend(). For
+ an unlimited axis, provide None. Can only be used
+ with chunks.
"""
with group._lock:
if data is None and shape is None:
@@ -501,7 +567,10 @@ class Dataset(HLObject):
if fletcher32:
plist.set_fletcher32()
- space_id = h5s.create_simple(shape)
+ if maxshape is not None:
+ maxshape = tuple(x if x is not None else h5s.UNLIMITED for x in maxshape)
+
+ space_id = h5s.create_simple(shape, maxshape)
type_id = h5t.py_create(dtype)
self.id = h5d.create(group.id, name, type_id, space_id, plist)
@@ -510,6 +579,34 @@ class Dataset(HLObject):
self._attrs = AttributeManager(self)
+ def extend(self, shape):
+ """ Resize the dataset so it's at least as big as "shape".
+
+ Note that the new shape must be compatible with the "maxshape"
+ argument provided when the dataset was created. Also, the rank of
+ the dataset cannot be changed.
+ """
+ with self._lock:
+ self.id.extend(shape)
+
+ def __len__(self):
+ """ The size of the first axis. TypeError if scalar.
+ """
+ shape = self.shape
+ if len(shape) == 0:
+ raise TypeError("Attempt to take len() of scalar dataset")
+ return shape[0]
+
+ def __iter__(self):
+ """ Iterate over the first axis. TypeError if scalar. Modifications
+ to the yielded data are *NOT* recorded.
+ """
+ shape = self.shape
+ if len(shape) == 0:
+ raise TypeError("Can't iterate over a scalar dataset")
+ for i in xrange(shape[0]):
+ yield self[i]
+
def __getitem__(self, args):
""" Read a slice from the HDF5 dataset. Takes slices and
recarray-style field names (more than one is allowed!) in any
@@ -560,12 +657,11 @@ class Dataset(HLObject):
if len(names) == 1:
# Match Numpy convention for recarray indexing
- return arr[names[0]].squeeze()
+ arr = arr[names[0]]
return arr.squeeze()
-
def __setitem__(self, args, val):
- """ Write to the HDF5 dataset from an Numpy array. The shape of the
+ """ Write to the HDF5 dataset from a Numpy array. The shape of the
Numpy array must match the shape of the selection, and the Numpy
array's datatype must be convertible to the HDF5 datatype.
"""
@@ -652,7 +748,7 @@ class AttributeManager(LockableObject):
def __setitem__(self, name, value):
""" Set the value of an attribute, overwriting any previous value.
The value you provide must be convertible to a Numpy array or
- scalar. If it's not, the action is aborted with no data loss.
+ scalar.
Any existing value is destroyed just before the call to h5a.create.
If the creation fails, the data is not recoverable.
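The maxshape handling in Dataset.__init__ above reduces to a single transform applied before the dataspace is created. The same idea in isolation (resolve_maxshape is a hypothetical helper name, not part of the commit):

    from h5py import h5s

    def resolve_maxshape(maxshape):
        # Replace None entries with the HDF5 unlimited-dimension
        # sentinel, mirroring the expression in Dataset.__init__.
        return tuple(x if x is not None else h5s.UNLIMITED for x in maxshape)

    print resolve_maxshape((None, 100))   # (h5s.UNLIMITED, 100)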
diff --git a/h5py/tests/test_highlevel.py b/h5py/tests/test_highlevel.py
index 320cf62..5236d47 100644
--- a/h5py/tests/test_highlevel.py
+++ b/h5py/tests/test_highlevel.py
@@ -196,6 +196,57 @@ class TestDataset(unittest.TestCase):
self.assert_(numpy.all(d.value == srcarr))
del self.f["NewDataset"]
+ def test_Dataset_extend(self):
+
+ print ""
+
+ init_shapes = [(100,), (100,100), (150,100)]
+ max_shapes = [(200,), (200,200), (None, 100)]
+ chunks = [(10,), (10,10), (10,10)]
+
+ final_shapes = {(100,): [ (150,), (200,) ],
+ (100,100): [(200,100), (200,200)],
+ (150,100): [ (200,100), (300,100), (500,100)] }
+
+ illegal_shapes = {(100,): [(250,)], (100,100): [(250,100), (250,250)],
+ (150,100): [(200,150)] }
+
+ for shape, maxshape, chunk in zip(init_shapes, max_shapes, chunks):
+ srcarr = numpy.arange(numpy.product(shape)).reshape(shape)
+ if "DS" in self.f:
+ del self.f["DS"]
+ ds = self.f.create_dataset("DS", data=srcarr, maxshape=maxshape, chunks=chunk)
+
+ self.assertEqual(ds.shape, shape)
+
+ for final_shape in final_shapes[shape]:
+ print " Extending %s to %s" % (shape, final_shape)
+ newarr = numpy.arange(numpy.product(final_shape)).reshape(final_shape)
+ ds.extend(final_shape)
+ ds[...] = newarr
+ self.assertEqual(ds.shape, final_shape)
+ self.assert_(numpy.all(ds[...] == newarr))
+
+ for illegal_shape in illegal_shapes[shape]:
+ self.assertRaises(H5Error, ds.extend, illegal_shape)
+
+ def test_Dataset_len_iter(self):
+
+ arr1 = numpy.arange(100).reshape((10,10))
+ arr2 = numpy.ones(())
+
+ d1 = self.f.create_dataset("D1", data=arr1)
+ d2 = self.f.create_dataset("D2", data=arr2)
+
+ self.assertEqual(len(arr1), len(d1))
+ self.assertRaises(TypeError, len, d2)
+
+ for idx, (hval, nval) in enumerate(zip(d1, arr1)):
+ self.assert_(numpy.all(hval == nval))
+
+ self.assertEqual(idx+1, len(arr1))
+ self.assertRaises(TypeError, list, d2)
+
def test_Dataset_slicing(self):
print ''
@@ -207,7 +258,7 @@ class TestDataset(unittest.TestCase):
slices += [ s[0:7:2,0:9:3,15:43:5], s[2:8:2,...] ]
slices += [ s[0], s[1], s[9], s[:] ] # Numpy convention
slices += [ numpy.random.random((10,10,50)) > 0.5 ] # Truth array
-
+
for dt in TYPES1:
srcarr = numpy.arange(10*10*50, dtype=dt).reshape(10,10,50)
@@ -242,6 +293,58 @@ class TestDataset(unittest.TestCase):
f.close()
os.unlink(fname)
+ def test_Dataset_flat(self):
+
+ print ""
+
+ s = SliceFreezer()
+
+ flatindexing = [0, 1, 45, 355]
+ flatindexing += [s[0:500:2], s[0:644], s[3:99], s[35:655:3]]
+ flatindexing += [s[:45:], s[::3], s[:78:4]]
+
+ extended = [ (0,1,3,2) ]
+
+ for dt in TYPES1:
+
+ srcarr = numpy.arange(10*10*20, dtype=dt).reshape(10,10,20)
+ srcarr = srcarr + numpy.sin(srcarr)
+
+ fname = tempfile.mktemp('.hdf5')
+ f = File(fname, 'w')
+ try:
+ d = Dataset(f, "NewDataset", data=srcarr)
+ self.assertEqual(d.shape, srcarr.shape)
+ self.assertEqual(d.dtype, srcarr.dtype)
+ for idx in flatindexing:
+ print " Checking flat read %.20s %s" % (dt, idx)
+ hresult = d.flat[idx]
+ nresult = srcarr.flat[idx]
+ if hasattr(hresult, 'shape'):
+ self.assertEqual(hresult.shape, nresult.shape)
+ self.assertEqual(hresult.dtype, nresult.dtype)
+ self.assert_(numpy.all(hresult == nresult), "%s\n%s" % (hresult, nresult))
+ else:
+ self.assertEqual(hresult, numpy.asscalar(nresult))
+
+ del f["NewDataset"]
+ d = Dataset(f, "NewDataset", data=srcarr)
+ for idx in flatindexing:
+ print " Checking flat write %.20s %s" % (dt, idx)
+ srcarr.flat[idx] = numpy.cos(srcarr.flat[idx])
+ d.flat[idx] = srcarr.flat[idx]
+ self.assert_(numpy.all(d.value == srcarr))
+
+ del f["NewDataset"]
+ d = Dataset(f, "NewDataset", data=srcarr)
+ for seq in extended:
+ subset = d.flat[seq]
+ for idx, entry in enumerate(seq):
+ self.assertEqual(subset[idx], srcarr.flat[entry])
+ finally:
+ f.close()
+ os.unlink(fname)
+
def test_Dataset_exceptions(self):
# These trigger exceptions in H5Dread
ref = numpy.ones((10,10), dtype='<i4')
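The "extended" case in test_Dataset_flat above exercises sequence indexing, checking it against NumPy fancy indexing on the flattened array. A sketch of the behaviour it asserts (import path and file name are illustrative assumptions):

    import numpy
    from h5py.highlevel import File

    f = File('seq.hdf5', 'w')
    d = f.create_dataset('seq', data=numpy.arange(100).reshape((10, 10)))

    # A sequence of indices selects those elements in the order given:
    print d.flat[0, 1, 3, 2]                 # elements 0, 1, 3, 2
    print numpy.arange(100)[[0, 1, 3, 2]]    # NumPy reference behaviour

    f.close()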
diff --git a/h5py/utils_hl.py b/h5py/utils_hl.py
index daef943..7011daa 100644
--- a/h5py/utils_hl.py
+++ b/h5py/utils_hl.py
@@ -16,6 +16,33 @@ def hbasename(name):
bname = '/'
return bname
+class FlatIndexer(object):
+
+ """
+ Utility class which translates a 1-D selection into a list of point
+ coordinates on an n-D array.
+
+ """
+
+ def __init__(self, shape, args):
+ """ Shape must be a tuple; args must be iterable.
+ """
+ try:
+ args = iter(args)
+ except TypeError:
+ args = (args,)
+
+ points = []
+
+ for arg in args:
+ if isinstance(arg, slice):
+ points.extend(xrange(*arg.indices(numpy.product(shape))))
+ elif isinstance(arg, int) or isinstance(arg, long):
+ points.append(arg)
+ else:
+ raise ValueError("Illegal index (ints, longs or slices only)")
+
+ self.coords = numpy.array([numpy.unravel_index(x, shape) for x in points])
+
def slice_select(space, args):
""" Perform a selection on the given HDF5 dataspace, using a tuple
of Python extended slice objects. The dataspace may be scalar or
@@ -26,8 +53,9 @@ def slice_select(space, args):
1-tuple:
1. A single Ellipsis: entire dataspace selected
- 2. A NumPy array: element-wise selection
- 3. A single integer or slice (row-broadcasting)
+ 2. A single integer or slice (row-broadcasting)
+ 3. A NumPy array: element-wise selection
+ 4. A FlatIndexer instance containing a coordinate list
n-tuple:
1. slice objects
@@ -50,6 +78,11 @@ def slice_select(space, args):
space.select_elements(indices)
return h5s.create_simple((len(indices),))
+ if isinstance(argval, FlatIndexer):
+ space.select_elements(argval.coords)
+ npoints = space.get_select_elem_npoints()
+ return h5s.create_simple((npoints,))
+
# Single-index obj[0] access is always equivalent to obj[0,...].
# Pack it back up and send it to the hyperslab machinery
args = (argval, Ellipsis)
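FlatIndexer's coordinate list above boils down to collecting flattened positions and running numpy.unravel_index over them. The same computation standalone (shape and indices illustrative):

    import numpy

    shape = (10, 10)
    nelems = numpy.product(shape)

    # A slice contributes every position it covers; ints pass through.
    points = [10] + list(xrange(*slice(5, 8).indices(nelems)))

    # Each flat position maps back to an n-D coordinate.
    coords = numpy.array([numpy.unravel_index(x, shape) for x in points])
    print coords[0]    # [1 0]: flat index 10 is row 1, column 0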
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/h5py.git