[python-hdf5storage] 120/152: Added read support for structured numpy.ndarrays as they are, as opposed to reading them as dicts.

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Feb 29 08:24:40 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to annotated tag 0.1
in repository python-hdf5storage.

commit 84dc2ffdf6ce3f506fc0fe5a0970549f75b9231e
Author: Freja Nordsiek <fnordsie at gmail.com>
Date:   Fri Feb 14 16:53:42 2014 -0500

    Added read support for structured numpy.ndarrays as they are, as opposed to reading them as dicts.
---
 hdf5storage/Marshallers.py | 144 +++++++++++++++++++++++++++++++++++++--------
 hdf5storage/utilities.py   |  26 ++++++++
 2 files changed, 146 insertions(+), 24 deletions(-)
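
For context, here is a minimal sketch (not part of the commit) of the round
trip this change enables. It assumes the hdf5storage 0.1 read/write API with
its default options; the file name 'example.h5' and path '/data' are
illustrative:

    import numpy as np
    import hdf5storage

    # A 2x3 structured ndarray with two fields.
    data = np.zeros((2, 3), dtype=[('a', 'f8'), ('b', 'i4')])
    data['a'][0, 1] = 3.5

    # Each field is written as its own Dataset inside a Group; the field
    # names are recorded in the 'Python.numpy.Fields' Attribute.
    hdf5storage.write(data, path='/data', filename='example.h5')

    # Before this commit, the Group came back as a dict of fields; with
    # it, the original structured ndarray is reconstructed.
    out = hdf5storage.read(path='/data', filename='example.h5')
    print(out.dtype.names)   # ('a', 'b')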

diff --git a/hdf5storage/Marshallers.py b/hdf5storage/Marshallers.py
index 79c8d07..1148009 100644
--- a/hdf5storage/Marshallers.py
+++ b/hdf5storage/Marshallers.py
@@ -459,7 +459,8 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
         TypeMarshaller.__init__(self)
         self.python_attributes |= {'Python.Shape', 'Python.Empty',
                                    'Python.numpy.UnderlyingType',
-                                   'Python.numpy.Container'}
+                                   'Python.numpy.Container',
+                                   'Python.numpy.Fields'}
         self.matlab_attributes |= {'MATLAB_class', 'MATLAB_empty',
                                    'MATLAB_int_decode'}
         self.types = [np.ndarray, np.matrix,
@@ -659,16 +660,24 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
 
             grp2 = grp[name]
 
+            # Grab the list of fields.
+            field_names = list(data_to_store.dtype.names)
+
             # Write the metadata, and set the MATLAB_class to 'struct'
-            # explicitly.
+            # explicitly. Then, we set the 'Python.numpy.Fields'
+            # Attribute to the field names if we are storing python
+            # metadata.
             self.write_metadata(f, grp, name, data, type_string,
                                 options)
             if options.matlab_compatible:
                 set_attribute_string(grp[name], 'MATLAB_class',
                                      'struct')
-
-            # Grab the list of fields.
-            field_names = list(data_to_store.dtype.fields.keys())
+            if options.store_python_metadata:
+                set_attribute_string_array(grp[name],
+                                           'Python.numpy.Fields',
+                                           field_names)
+            else:
+                del_attribute(grp[name], 'Python.numpy.Fields')
 
             # Delete any Datasets/Groups not corresponding to a field
             # name in data if that option is set.
@@ -689,6 +698,13 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                 for index, x in np.ndenumerate(data_to_store):
                     new_data[index] = x[field]
 
+                # If we are supposed to reverse dimension order, it has
+                # already been done, but write_data expects that it
+                # hasn't, so it needs to be reversed again before
+                # passing it on.
+                if options.reverse_dimension_order:
+                    new_data = new_data.T
+
                 write_data(f, grp2, field, new_data, None, options)
 
                 if field in grp2:
@@ -723,10 +739,12 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                 grp[name][...] = data_to_store
 
             # Write the metadata using the inherited function (good
-            # enough).
+            # enough). The Attribute 'Python.numpy.Fields', if present,
+            # needs to be deleted since this isn't a structured ndarray.
 
             self.write_metadata(f, grp, name, data, type_string,
                                 options)
+            del_attribute(grp[name], 'Python.numpy.Fields')
 
     def write_metadata(self, f, grp, name, data, type_string, options):
         # First, call the inherited version to do most of the work.
@@ -802,15 +820,10 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
             del_attribute(grp[name], 'MATLAB_int_decode')
 
     def read(self, f, grp, name, options):
-        # If name is not present or is not a Dataset, then we can't read
-        # it and have to throw an error.
-        if name not in grp or not isinstance(grp[name], h5py.Dataset):
-            raise NotImplementedError('No Dataset ' + name +
-                                      ' is present.')
-
-        # Grab the data's datatype and dtype.
-        datatype = grp[name].id.get_type()
-        h5_dt = datatype.dtype
+        # If name is not present, then we can't read it and have to
+        # throw an error.
+        if name not in grp:
+            raise NotImplementedError(name + ' is not present.')
 
         # Get the different attributes this marshaller uses.
 
@@ -821,20 +834,103 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
         container = get_attribute_string(grp[name], \
             'Python.numpy.Container')
         python_empty = get_attribute(grp[name], 'Python.Empty')
+        python_fields = get_attribute_string_array(grp[name], \
+            'Python.numpy.Fields')
 
         matlab_class = get_attribute_string(grp[name], 'MATLAB_class')
         matlab_empty = get_attribute(grp[name], 'MATLAB_empty')
 
-        # Read the data and get its dtype. Figuring it out and doing any
-        # conversions can be done later.
-        data = grp[name][...]
-        dt = data.dtype
+        # If it is a Dataset, it can simply be read and then acted upon
+        # (if it is an HDF5 Reference array, it will need to be read
+        # recursively). If it is a Group, then it is a
+        # structured-ndarray-like object that needs to be read
+        # field-wise and then constructed.
+        if isinstance(grp[name], h5py.Dataset):
+            # Read the data.
+            data = grp[name][...]
+
+            # If it is a reference type, then we need to make an object
+            # array that is its replicate, but with the objects they are
+            # pointing to in their elements instead of just the
+            # references.
+            if h5py.check_dtype(ref=grp[name].dtype) is not None:
+                data = read_object_array(f, data, options)
+        else:
+            # Starting with an empty dict, all that has to be done is
+            # iterate through all the Datasets and Groups in grp[name]
+            # and add them to a dict with their name as the key. Since
+            # we don't want an exception thrown by reading an element to
+            # stop the whole reading process, the reading is wrapped in
+            # a try block that just catches exceptions and then does
+            # nothing about them (nothing needs to be done).
+            struct_data = dict()
+            for k in grp[name]:
+                # We must exclude group_for_references
+                if grp[name][k].name == options.group_for_references:
+                    continue
+                try:
+                    struct_data[k] = read_data(f, grp[name], k, options)
+                except:
+                    pass
+
+            # The dtype for the structured ndarray needs to be
+            # composed. This is done by going through each field (in the
+            # proper order, if the fields were given, or any order if
+            # not) and determining the dtype and shape of that field to
+            # put in the list.
 
-        # If it is a reference type, then we need to make an object
-        # array that is its replicate, but with the objects they are
-        # pointing to in their elements instead of just the references.
-        if h5py.check_dtype(ref=grp[name].dtype) is not None:
-            data = read_object_array(f, data, options)
+            if python_fields is None:
+                fields = struct_data.keys()
+            else:
+                fields = python_fields
+
+            dt_whole = []
+            for k in fields:
+                v = struct_data[k]
+
+                # If any of the elements are not Numpy types or if they
+                # don't all have the exact same dtype and shape, then
+                # this field will just be an object field.
+                first = v.flatten()[0]
+                if not isinstance(first, tuple(self.types)):
+                    dt_whole.append((k, 'object'))
+                    continue
+
+                dt = first.dtype
+                sp = first.shape
+                all_same = True
+                for index, x in np.ndenumerate(v):
+                    if not isinstance(x, tuple(self.types)) \
+                            or dt != x.dtype or sp != x.shape:
+                        all_same = False
+                        break
+
+                # If they are all the same, then dt and sp should be
+                # used. Otherwise, it has to be object.
+                if all_same:
+                    dt_whole.append((k, dt, sp))
+                else:
+                    dt_whole.append((k, 'object'))
+
+            # Make the structured ndarray with the constructed
+            # dtype. The shape is simply the shape of the object arrays
+            # of its fields, so we might as well use the shape of
+            # v. Then, all the elements of every field need to be
+            # assigned.
+            data = np.zeros(shape=v.shape, dtype=dt_whole)
+            for k, v in struct_data.items():
+                for index, x in np.ndenumerate(v):
+                    data[k][index] = x
+
+            # If the file was formatted for MATLAB or dimension order
+            # is otherwise being reversed, the dimensions already got
+            # reversed by using the shape of v, so we need to transpose
+            # here so that the final transpose done later produces the
+            # right shape.
+            if matlab_class is not None or \
+                    options.reverse_dimension_order:
+                # Transpose to undo the reversal introduced above.
+                data = data.T
 
         # If metadata is present, that can be used to convert to the
         # desired/closest Python data types. If none is present, or not
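
The dtype-composition step added to read() above can be tried standalone. A
minimal sketch with plain numpy, using illustrative stand-in data in place of
what would actually be read from the Group, and a compact all(...) check in
place of the commit's explicit loop:

    import numpy as np

    shape = (2, 3)

    # Stand-in for struct_data as read from the Group: one object array
    # per field, each element a numpy scalar or array.
    struct_data = {'a': np.empty(shape, dtype='object'),
                   'b': np.empty(shape, dtype='object')}
    for index in np.ndindex(shape):
        struct_data['a'][index] = np.float64(1.5)
        struct_data['b'][index] = np.zeros((4,), dtype='i4')

    # Compose the dtype: a field keeps a concrete dtype only if every
    # element matches the first element's dtype and shape; otherwise it
    # falls back to 'object', just as in the commit.
    dt_whole = []
    for k, v in struct_data.items():
        first = v.flatten()[0]
        if all(isinstance(x, (np.generic, np.ndarray))
               and x.dtype == first.dtype and x.shape == first.shape
               for _, x in np.ndenumerate(v)):
            dt_whole.append((k, first.dtype, first.shape))
        else:
            dt_whole.append((k, 'object'))

    # Build the structured ndarray and assign every element field-wise.
    data = np.zeros(shape=shape, dtype=dt_whole)
    for k, v in struct_data.items():
        for index, x in np.ndenumerate(v):
            data[k][index] = x
    print(data.dtype)   # [('a', '<f8'), ('b', '<i4', (4,))]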
diff --git a/hdf5storage/utilities.py b/hdf5storage/utilities.py
index cc531b4..2c352ee 100644
--- a/hdf5storage/utilities.py
+++ b/hdf5storage/utilities.py
@@ -691,6 +691,32 @@ def get_attribute_string(target, name):
         return None
 
 
+def get_attribute_string_array(target, name):
+    """ Gets a string array Attribute from a Dataset or Group.
+
+    Gets the value of an Attribute that is a string array if it is
+    present (returning ``None`` if it is not).
+
+    Parameters
+    ----------
+    target : Dataset or Group
+        Dataset or Group to get the attribute of.
+    name : str
+        Name of the string array Attribute to get.
+
+    Returns
+    -------
+    list of str or None
+        The string array value of the Attribute if it is present, or
+        ``None`` if it isn't.
+
+    """
+    value = get_attribute(target, name)
+    if value is None:
+        return value
+    return [decode_to_str(x) for x in value]
+
+
 def set_attribute(target, name, value):
     """ Sets an attribute on a Dataset or Group.
 

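And a quick sketch of the new helper paired with the existing
set_attribute_string_array (both appear in hdf5storage.utilities per this
diff; the file name 'attrs.h5' is illustrative):

    import h5py
    from hdf5storage.utilities import (set_attribute_string_array,
                                       get_attribute_string_array)

    with h5py.File('attrs.h5', 'w') as f:
        grp = f.create_group('data')
        set_attribute_string_array(grp, 'Python.numpy.Fields',
                                   ['a', 'b'])
        print(get_attribute_string_array(grp, 'Python.numpy.Fields'))
        # -> ['a', 'b']
        print(get_attribute_string_array(grp, 'missing'))
        # -> None
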
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-hdf5storage.git


