[h5py] 103/455: Flat indexing, dataset len/iter, datasets can now be extended

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jul 2 18:19:22 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to annotated tag 1.3.0
in repository h5py.

commit b60dd48111c222217200b42d19334099b6b5b379
Author: andrewcollette <andrew.collette at gmail.com>
Date:   Wed Aug 20 03:39:49 2008 +0000

    Flat indexing, dataset len/iter, datasets can now be extended
---
 h5py/highlevel.py            | 110 ++++++++++++++++++++++++++++++++++++++++---
 h5py/tests/test_highlevel.py | 105 ++++++++++++++++++++++++++++++++++++++++-
 h5py/utils_hl.py             |  37 ++++++++++++++-
 3 files changed, 242 insertions(+), 10 deletions(-)
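
For orientation, here is how the three new features fit together. This is an
illustrative sketch rather than part of the commit: the file name, dataset
name, and import path are made up, and it assumes the Python 2 high-level API
of this era (create_dataset with data/maxshape/chunks, as exercised by the
tests below).

    import numpy
    from h5py.highlevel import File

    f = File('demo.hdf5', 'w')

    # maxshape marks which axes may grow (None = unlimited); per the new
    # docstring it can only be used together with chunked storage.
    data = numpy.arange(100, dtype='f8').reshape((10, 10))
    dset = f.create_dataset('grid', data=data,
                            maxshape=(None, 10), chunks=(10, 10))

    # Flat (1-D, C-contiguous) access through the new .flat proxy
    print dset.flat[10]        # same element as dset[1, 0]
    print dset.flat[5:15]      # crosses a row boundary; n-D slicing can't

    # len() and iteration follow the NumPy convention: the first axis
    print len(dset)            # -> 10
    for row in dset:           # yields dset[0], dset[1], ...
        pass

    # Grow the first axis, staying within the maxshape limit
    dset.extend((20, 10))
    print dset.shape           # -> (20, 10)

    f.close()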

diff --git a/h5py/highlevel.py b/h5py/highlevel.py
index c8b773d..1b987e3 100644
--- a/h5py/highlevel.py
+++ b/h5py/highlevel.py
@@ -53,7 +53,7 @@ import threading
 
 from h5py import h5, h5f, h5g, h5s, h5t, h5d, h5a, h5p, h5z, h5i, config
 from h5py.h5 import H5Error
-from utils_hl import slice_select, hbasename, strhdr, strlist
+from utils_hl import slice_select, hbasename, strhdr, strlist, FlatIndexer
 from browse import _H5Browser
 
 
@@ -406,6 +406,63 @@ class File(Group):
                     readline.add_history(x)
         self._path = browser.path
 
+class FlatIndexProxy(object):
+
+    """
+        Utility class which allows 1-D indexing of datasets.
+
+        These come attached to Dataset objects as <obj>.flat.  They behave
+        like 1-D arrays; you can slice into them and assign to slices like
+        NumPy flatiter objects.  However, they are not iterable.
+
+        In addition to single indices and slices, you can also provide an
+        iterable which yields indices and slices.  The returned array will
+        be the union of these selections, in the order they were presented,
+        with duplicate entries skipped.
+
+        Examples:  (let dset be of shape (10,10))
+            >>> dset.flat[10]       # Equivalent to dset[1,0]
+            >>> dset.flat[5:15]     # Note you can't do this with dset[x,y]
+            >>> dset.flat[0,1,3,2]  # First 4 elements, in the specified order
+
+        Caveats:  At the HDF5 level, this works by explicitly listing the set
+        of points to be accessed.  For large, regularly strided selections,
+        you should use the standard n-D slicing syntax, which is significantly
+        faster.
+    """
+    
+    def __init__(self, dset):
+        self._dset = dset
+
+    def __getitem__(self, args):
+        """ Read from the dataset, treating it as a 1-D (C-contiguous) array.
+
+            Allowed slicing mechanisms:
+                1. Ints/longs
+                2. Extended slices
+                3. Sequences of ints/extended slices (e.g. flat[0,1,2])
+
+            Subsets which result in a single element are returned as scalars.
+        """
+        indexer = FlatIndexer(self._dset.shape, args)
+        arr = self._dset[indexer]
+
+        # These match the way NumPy behaves
+        if arr.shape == ():
+            return numpy.asscalar(arr)
+        return arr.newbyteorder('=')
+
+    def __setitem__(self, args, val):
+        """ Write to the dataset, treating it as a 1-D (C-contiguous) array.
+
+            Allowed slicing mechanisms:
+                1. Ints/longs
+                2. Extended slices
+                3. Sequences of ints/extended slices (e.g. flat[0,1,2])
+        """
+        indexer = FlatIndexer(self._dset.shape, args)
+        self._dset[indexer] = val
+
 class Dataset(HLObject):
 
     """ High-level interface to an HDF5 dataset.
@@ -430,6 +487,9 @@ class Dataset(HLObject):
     dtype = property(lambda self: self.id.dtype,
         doc = "Numpy dtype representing the datatype")
 
+    flat = property(lambda self: FlatIndexProxy(self),
+        doc = "1-D read/write slicing access to the dataset.  Not iterable.")
+
     def _getval(self):
         with self._lock:
             arr = self[...]
@@ -442,7 +502,8 @@ class Dataset(HLObject):
 
     def __init__(self, group, name,
                     shape=None, dtype=None, data=None,
-                    chunks=None, compression=None, shuffle=False, fletcher32=False):
+                    chunks=None, compression=None, shuffle=False,
+                    fletcher32=False, maxshape=None):
         """ Construct a Dataset object.  You might find it easier to use the
             Group methods: Group["name"] or Group.create_dataset().
 
@@ -469,6 +530,11 @@ class Dataset(HLObject):
             compression:   DEFLATE (gzip) compression level, int or None*
             shuffle:       Use the shuffle filter? (requires compression) T/F*
             fletcher32:    Enable Fletcher32 error detection? T/F*
+
+            maxshape:      Tuple giving dataset maximum dimensions.  You can
+                           grow each axis up to this limit using extend(). For
+                           an unlimited axis, provide None.  Can only be used
+                           with chunks.
         """
         with group._lock:
             if data is None and shape is None:
@@ -501,7 +567,10 @@ class Dataset(HLObject):
                 if fletcher32:
                     plist.set_fletcher32()
 
-                space_id = h5s.create_simple(shape)
+                if maxshape is not None:
+                    maxshape = tuple(x if x is not None else h5s.UNLIMITED for x in maxshape)
+
+                space_id = h5s.create_simple(shape, maxshape)
                 type_id = h5t.py_create(dtype)
 
                 self.id = h5d.create(group.id, name, type_id, space_id, plist)
@@ -510,6 +579,34 @@ class Dataset(HLObject):
 
             self._attrs = AttributeManager(self)
 
+    def extend(self, shape):
+        """ Resize the dataset so it's at least as big as "shape".
+
+            Note that the new shape must be compatible with the "maxshape"
+            argument provided when the dataset was created.  Also, the rank of
+            the dataset cannot be changed.
+        """
+        with self._lock:
+            self.id.extend(shape)
+
+    def __len__(self):
+        """ The size of the first axis.  TypeError if scalar.
+        """
+        shape = self.shape
+        if len(shape) == 0:
+            raise TypeError("Attempt to take len() of scalar dataset")
+        return shape[0]
+
+    def __iter__(self):
+        """ Iterate over the first axis.  TypeError if scalar.  Modifications
+            to the yielded data are *NOT* recorded.
+        """
+        shape = self.shape
+        if len(shape) == 0:
+            raise TypeError("Can't iterate over a scalar dataset")
+        for i in xrange(shape[0]):
+            yield self[i]
+
     def __getitem__(self, args):
         """ Read a slice from the HDF5 dataset.  Takes slices and
             recarray-style field names (more than one is allowed!) in any
@@ -560,12 +657,11 @@ class Dataset(HLObject):
 
             if len(names) == 1:
                 # Match Numpy convention for recarray indexing
-                return arr[names[0]].squeeze()
+                arr = arr[names[0]]
             return arr.squeeze()
 
-
     def __setitem__(self, args, val):
-        """ Write to the HDF5 dataset from an Numpy array.  The shape of the
+        """ Write to the HDF5 dataset from a Numpy array.  The shape of the
             Numpy array must match the shape of the selection, and the Numpy
             array's datatype must be convertible to the HDF5 datatype.
         """
@@ -652,7 +748,7 @@ class AttributeManager(LockableObject):
     def __setitem__(self, name, value):
         """ Set the value of an attribute, overwriting any previous value.
             The value you provide must be convertible to a Numpy array or
-            scalar.  If it's not, the action is aborted with no data loss.
+            scalar.
 
             Any existing value is destroyed just before the call to h5a.create.
             If the creation fails, the data is not recoverable.
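
The maxshape handling in the constructor above boils down to swapping None
for h5s.UNLIMITED before the dataspace is created. A minimal illustration of
just that step, using the low-level h5s calls the constructor makes (the
concrete shapes are made up):

    from h5py import h5s

    # None entries become h5s.UNLIMITED, as in Dataset.__init__; a dataspace
    # created this way can later be extended along the unlimited axis.
    maxshape = (None, 100)
    maxshape = tuple(x if x is not None else h5s.UNLIMITED for x in maxshape)
    space_id = h5s.create_simple((10, 100), maxshape)
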
diff --git a/h5py/tests/test_highlevel.py b/h5py/tests/test_highlevel.py
index 320cf62..5236d47 100644
--- a/h5py/tests/test_highlevel.py
+++ b/h5py/tests/test_highlevel.py
@@ -196,6 +196,57 @@ class TestDataset(unittest.TestCase):
                     self.assert_(numpy.all(d.value == srcarr))
                     del self.f["NewDataset"]               
 
+    def test_Dataset_extend(self):
+
+        print ""
+
+        init_shapes = [(100,), (100,100), (150,100)]
+        max_shapes = [(200,), (200,200), (None, 100)]
+        chunks = [(10,), (10,10), (10,10)]
+
+        final_shapes = {(100,): [ (150,), (200,) ],
+                        (100,100): [(200,100), (200,200)],
+                        (150,100): [ (200,100), (300,100), (500,100)] }
+
+        illegal_shapes = {(100,): [(250,)], (100,100): [(250,100), (250,250)],
+                          (150,100): [(200,150)] }
+
+        for shape, maxshape, chunk in zip(init_shapes, max_shapes, chunks):
+            srcarr = numpy.arange(numpy.product(shape)).reshape(shape)
+            if "DS" in self.f:
+                del self.f["DS"]
+            ds = self.f.create_dataset("DS", data=srcarr, maxshape=maxshape, chunks=chunk)
+
+            self.assertEqual(ds.shape, shape)
+
+            for final_shape in final_shapes[shape]:
+                print "    Extending %s to %s" % (shape, final_shape)
+                newarr = numpy.arange(numpy.product(final_shape)).reshape(final_shape)
+                ds.extend(final_shape)
+                ds[...] = newarr
+                self.assertEqual(ds.shape, final_shape)
+                self.assert_(numpy.all(ds[...] == newarr))
+
+            for illegal_shape in illegal_shapes[shape]:
+                self.assertRaises(H5Error, ds.extend, illegal_shape)
+        
+    def test_Dataset_len_iter(self):
+
+        arr1 = numpy.arange(100).reshape((10,10))
+        arr2 = numpy.ones(())
+
+        d1 = self.f.create_dataset("D1", data=arr1)
+        d2 = self.f.create_dataset("D2", data=arr2)
+
+        self.assertEqual(len(arr1), len(d1))
+        self.assertRaises(TypeError, len, d2)
+
+        for idx, (hval, nval) in enumerate(zip(d1, arr1)):
+            self.assert_(numpy.all(hval == nval))
+        
+        self.assertEqual(idx+1, len(arr1))
+        self.assertRaises(TypeError, list, d2)
+
     def test_Dataset_slicing(self):
 
         print ''
@@ -207,7 +258,7 @@ class TestDataset(unittest.TestCase):
         slices += [ s[0:7:2,0:9:3,15:43:5], s[2:8:2,...] ]
         slices += [ s[0], s[1], s[9], s[:] ] # Numpy convention
         slices += [ numpy.random.random((10,10,50)) > 0.5 ]  # Truth array
-        
+       
         for dt in TYPES1:
 
             srcarr = numpy.arange(10*10*50, dtype=dt).reshape(10,10,50)
@@ -242,6 +293,58 @@ class TestDataset(unittest.TestCase):
                 f.close()
                 os.unlink(fname)   
 
+    def test_Dataset_flat(self):
+
+        print ""
+
+        s = SliceFreezer()
+
+        flatindexing = [0, 1, 45, 355]
+        flatindexing += [s[0:500:2], s[0:644], s[3:99], s[35:655:3]]
+        flatindexing += [s[:45:], s[::3], s[:78:4]]
+
+        extended = [ (0,1,3,2) ]
+
+        for dt in TYPES1:
+
+            srcarr = numpy.arange(10*10*20, dtype=dt).reshape(10,10,20)
+            srcarr = srcarr + numpy.sin(srcarr)
+
+            fname = tempfile.mktemp('.hdf5')
+            f = File(fname, 'w')
+            try:
+                d = Dataset(f, "NewDataset", data=srcarr)
+                self.assertEqual(d.shape, srcarr.shape)
+                self.assertEqual(d.dtype, srcarr.dtype)
+                for idx in flatindexing:
+                    print "    Checking flat read %.20s %s" % (dt, idx)
+                    hresult = d.flat[idx]
+                    nresult = srcarr.flat[idx]
+                    if hasattr(hresult, 'shape'):
+                        self.assertEqual(hresult.shape, nresult.shape)
+                        self.assertEqual(hresult.dtype, nresult.dtype)
+                        self.assert_(numpy.all(hresult == nresult), "%s\n%s" % (hresult, nresult))
+                    else:
+                        self.assertEqual(hresult, numpy.asscalar(nresult))
+
+                del f["NewDataset"]
+                d = Dataset(f, "NewDataset", data=srcarr)
+                for idx in flatindexing:
+                    print "    Checking flat write %.20s %s" % (dt, idx)
+                    srcarr.flat[idx] = numpy.cos(srcarr.flat[idx])
+                    d.flat[idx] = srcarr.flat[idx]
+                    self.assert_(numpy.all(d.value == srcarr))
+
+                del f["NewDataset"]
+                d = Dataset(f, "NewDataset", data=srcarr)
+                for seq in extended:
+                    subset = d.flat[seq]
+                    for idx, entry in enumerate(seq):
+                        self.assertEqual(subset[idx], srcarr.flat[entry])
+            finally:
+                f.close()
+                os.unlink(fname)   
+
     def test_Dataset_exceptions(self):
         # These trigger exceptions in H5Dread
         ref = numpy.ones((10,10), dtype='<i4')
diff --git a/h5py/utils_hl.py b/h5py/utils_hl.py
index daef943..7011daa 100644
--- a/h5py/utils_hl.py
+++ b/h5py/utils_hl.py
@@ -16,6 +16,33 @@ def hbasename(name):
         bname = '/'
     return bname
 
+class FlatIndexer(object):
+
+    """
+        Utility class which encapsulates a 1-D selection into an n-D array.
+
+    """
+
+    def __init__(self, shape, args):
+        """ Shape must be a tuple; args must be iterable.
+        """
+        try:
+            args = iter(args)
+        except TypeError:
+            args = (args,)
+
+        points = []
+
+        for arg in args:
+            if isinstance(arg, slice):
+                points.extend(xrange(*arg.indices(numpy.product(shape))))
+            elif isinstance(arg, int) or isinstance(arg, long):
+                points.append(arg)
+            else:
+                raise ValueError("Illegal index (ints, longs or slices only)")
+
+        self.coords = numpy.array([numpy.unravel_index(x, shape) for x in points])
+
 def slice_select(space, args):
     """ Perform a selection on the given HDF5 dataspace, using a tuple
         of Python extended slice objects.  The dataspace may be scalar or
@@ -26,8 +53,9 @@ def slice_select(space, args):
 
         1-tuple:
             1. A single Ellipsis: entire dataspace selected
-            2. A NumPy array: element-wise selection
-            3. A single integer or slice (row-broadcasting)
+            2. A single integer or slice (row-broadcasting)
+            3. A NumPy array: element-wise selection
+            4. A FlatIndexer instance containing a coordinate list
 
         n-tuple:
             1. slice objects
@@ -50,6 +78,11 @@ def slice_select(space, args):
             space.select_elements(indices)
             return h5s.create_simple((len(indices),))
 
+        if isinstance(argval, FlatIndexer):
+            space.select_elements(argval.coords)
+            npoints = space.get_select_elem_npoints()
+            return h5s.create_simple((npoints,))
+
         # Single-index obj[0] access is always equivalent to obj[0,...].
         # Pack it back up and send it to the hyperslab machinery
         args = (argval, Ellipsis)
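
At its core, FlatIndexer expands ints and slices into a list of flat
positions and converts each one to an n-D coordinate with
numpy.unravel_index. The standalone sketch below mirrors that logic outside
the class (the function name is made up; Python 2, to match the commit):

    import numpy

    def flat_to_coords(shape, args):
        """Sketch of FlatIndexer.__init__: ints and slices over the
        flattened shape become n-D coordinates in C-contiguous order."""
        try:
            args = iter(args)
        except TypeError:
            args = (args,)     # a single int or slice

        size = numpy.product(shape)
        points = []
        for arg in args:
            if isinstance(arg, slice):
                points.extend(xrange(*arg.indices(size)))
            elif isinstance(arg, (int, long)):
                points.append(arg)
            else:
                raise ValueError("Illegal index (ints, longs or slices only)")

        # One (row, col, ...) coordinate per selected flat position
        return numpy.array([numpy.unravel_index(x, shape) for x in points])

    print flat_to_coords((10, 10), 10)             # -> [[1 0]]
    print flat_to_coords((10, 10), (0, 1, 3, 2))   # coordinates in given order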
