[python-hdf5storage] 17/152: Added low level, somewhat buggy for strings, read support for Numpy types.

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Mon Feb 29 08:24:30 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to annotated tag 0.1
in repository python-hdf5storage.

commit 7ebe9aaa91db497d7e4febbfb2a8fa4b752fcbe4
Author: Freja Nordsiek <fnordsie at gmail.com>
Date:   Tue Jan 21 19:03:47 2014 -0500

    Added low level, somewhat buggy for strings, read support for Numpy types.
---
 hdf5storage/Marshallers.py | 120 ++++++++++++++++++++++++++-
 hdf5storage/core.py        |   4 +-
 hdf5storage/lowlevel.py    |  51 ++++++++++++
 hdf5storage/utilities.py   | 200 ++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 370 insertions(+), 5 deletions(-)
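
As an aside, a rough usage sketch (not part of the commit) of the new low
level read path added below: the file name 'data.h5', the variable name 'a',
and the hdf5storage.Options() constructor are assumptions for illustration
only.

    import h5py
    import hdf5storage
    from hdf5storage.lowlevel import read_data

    # Hypothetical sketch: read back a variable '/a' from an HDF5 file
    # previously written by hdf5storage. Signatures follow this commit's
    # code and may differ in later releases.
    options = hdf5storage.Options()   # assumed entry point for Options
    with h5py.File('data.h5', 'r') as f:
        data = read_data(f, f['/'], 'a', options)
    print(data)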

diff --git a/hdf5storage/Marshallers.py b/hdf5storage/Marshallers.py
index 98d9191..9b2144b 100644
--- a/hdf5storage/Marshallers.py
+++ b/hdf5storage/Marshallers.py
@@ -326,6 +326,23 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                                  np.string_: 'char',
                                  np.unicode: 'char'}
 
+        # Make a dict to look up the opposite direction (given a MATLAB
+        # class, which numpy type to use).
+
+        self.__MATLAB_classes_reverse = {'logical': np.bool8,
+                                         'uint8': np.uint8,
+                                         'uint16': np.uint16,
+                                         'uint32': np.uint32,
+                                         'uint64': np.uint64,
+                                         'int8': np.int8,
+                                         'int16': np.int16,
+                                         'int32': np.int32,
+                                         'int64': np.int64,
+                                         'single': np.float32,
+                                         'double': np.float64,
+                                         'char': np.unicode}
+
+
         # Set matlab_classes to the supported classes (the values).
         self.matlab_classes = list(self.__MATLAB_classes.values())
 
@@ -461,6 +478,105 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
             else:
                 del_attribute(grp[name], 'MATLAB_int_decode')
 
+    def read(self, f, grp, name, options):
+        # If name is not present or is not a Dataset, then we can't read
+        # it and have to throw an error.
+        if name not in grp or not isinstance(grp[name], h5py.Dataset):
+            raise NotImplementedError('No Dataset ' + name +
+                                      ' is present.')
+
+        # Grab the data's datatype and dtype.
+        datatype = grp[name].id.get_type()
+        h5_dt = datatype.dtype
+
+        # Get the different attributes this marshaller uses.
+
+        type_string = get_attribute_string(grp[name], 'CPython.Type')
+        underlying_type = get_attribute_string(grp[name], \
+            'CPython.numpy.UnderlyingType')
+        shape = get_attribute(grp[name], 'CPython.Shape')
+        cpython_empty = get_attribute(grp[name], 'CPython.Empty')
+
+        matlab_class = get_attribute_string(grp[name], 'MATLAB_class')
+        matlab_empty = get_attribute(grp[name], 'MATLAB_empty')
+
+        # Read the data and get its dtype. Figuring it out and doing any
+        # conversions can be done later.
+        data = grp[name][...]
+        dt = data.dtype
+
+        # If metadata is present, it can be used to convert to the
+        # desired/closest Python data types. If none is present, or not
+        # enough of it, then no conversions can be done.
+
+        if type_string is not None and underlying_type and \
+                shape is not None:
+            # If it is empty ('CPython.Empty' set to 1), then the shape
+            # information is stored in data and we need to set data to
+            # the empty array of the proper type (in underlying_type)
+            # and the given shape.
+            if cpython_empty == 1:
+                data = np.zeros(tuple(np.uint64(data)),
+                                dtype=underlying_type)
+
+            # If MATLAB attributes are present or the reverse dimension
+            # order option was given, the dimension order needs to be
+            # reversed. This needs to be done before any reshaping as
+            # the shape was stored before any dimensional reordering.
+            if matlab_class is not None or \
+                    options.reverse_dimension_order:
+                data = data.T
+
+            # If the shape of data and the shape attribute are
+            # different but give the same number of elements, then data
+            # needs to be reshaped.
+            if tuple(shape) != data.shape \
+                    and np.prod(shape) == np.prod(data.shape):
+                data.shape = tuple(shape)
+
+            # String types might have to be decoded depending on the
+            # underlying type, and MATLAB class if given.
+            if underlying_type[0:5] == 'bytes':
+                data = decode_to_numpy_ascii(data)
+            elif underlying_type[0:3] == 'str' \
+                    or matlab_class == 'char':
+                data = decode_to_numpy_unicode(data)
+
+            # If it is a complex type, then it needs to be decoded
+            # properly.
+            if underlying_type[0:7] == 'complex':
+                data = decode_complex(data)
+
+        elif matlab_class in self.__MATLAB_classes_reverse:
+            # MATLAB formatting information was given. The extraction
+            # did most of the work except handling empties, array
+            # dimension order, and string conversion.
+
+            # If it is empty ('MATLAB_empty' set to 1), then the shape
+            # information is stored in data and we need to set data to
+            # the empty array of the proper type.
+            if matlab_empty == 1:
+                data = np.zeros(tuple(np.uint64(data)), \
+                    dtype=self.__MATLAB_classes_reverse[matlab_class])
+
+            # The order of the dimensions must be switched from Fortran
+            # order which MATLAB uses to C order which Python uses.
+            data = data.T
+
+            # Now, if the matlab class is 'single' or 'double', data
+            # could possibly be a complex type which needs to be
+            # properly decoded.
+            if matlab_class in ['single', 'double']:
+                data = decode_complex(data)
+
+            # If it is a 'char' type, the proper conversion to
+            # numpy.unicode needs to be done.
+            if matlab_class == 'char':
+                data = decode_to_numpy_unicode(data)
+
+        # Done adjusting data, so it can be returned.
+        return data
+
 
 class PythonScalarMarshaller(NumpyScalarArrayMarshaller):
     def __init__(self):
@@ -499,7 +615,7 @@ class PythonStringMarshaller(NumpyScalarArrayMarshaller):
         # is a bytearray in which case it needs to be converted to a
         # uint8 array.
 
-        if isinstance(data,bytearray):
+        if isinstance(data, bytearray):
             cdata = np.uint8(data)
         else:
             cdata = np.string_(data)
@@ -526,7 +642,7 @@ class PythonNoneMarshaller(NumpyScalarArrayMarshaller):
         # data and the right type_string set (parent can't guess right
         # from the modified form).
         NumpyScalarArrayMarshaller.write(self, f, grp, name,
-                                         np.ndarray(shape=(0,0),
+                                         np.ndarray(shape=(0, 0),
                                          dtype='float64'),
                                          self.get_type_string(data,
                                          type_string), options)
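
For context on the empty handling in the new read() method above, here is a
minimal sketch (not from the commit) of the convention it decodes: when
'CPython.Empty' or 'MATLAB_empty' is 1, the dataset stores the shape rather
than the values, and read() rebuilds an empty array from it. The numbers are
made up.

    import numpy as np

    # Hypothetical dataset contents for an empty array: its shape, stored
    # as integers, since there are no element values to store.
    stored = np.array([2, 0, 3], dtype=np.uint64)

    # The same reconstruction read() performs for the empty case.
    data = np.zeros(tuple(np.uint64(stored)), dtype='float64')
    assert data.shape == (2, 0, 3) and data.size == 0
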
diff --git a/hdf5storage/core.py b/hdf5storage/core.py
index 5479f0e..cf78928 100644
--- a/hdf5storage/core.py
+++ b/hdf5storage/core.py
@@ -38,7 +38,7 @@ import h5py
 
 from hdf5storage.utilities import *
 
-from hdf5storage.lowlevel import write_data
+from hdf5storage.lowlevel import write_data, read_data
 from hdf5storage import Marshallers
 
 
@@ -589,7 +589,7 @@ class MarshallerCollection(object):
         hdf5storage.Marshallers.TypeMarshaller.cpython_type_strings
 
         """
-        if type_string in self._matlab_classes:
+        if matlab_class in self._matlab_classes:
             return copy.deepcopy(self._matlab_classes[matlab_class])
         else:
             return None
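
The one line change above fixes a lookup that tested the wrong variable. A
minimal sketch of the corrected pattern, with made-up dictionary contents
standing in for MarshallerCollection._matlab_classes:

    import copy

    # Made-up mapping from MATLAB class names to marshaller objects.
    _matlab_classes = {'double': object(), 'char': object()}

    def get_marshaller_for_matlab_class(matlab_class):
        # The membership test must use matlab_class, the function's own
        # argument, not the unrelated type_string name used before the fix.
        if matlab_class in _matlab_classes:
            return copy.deepcopy(_matlab_classes[matlab_class])
        else:
            return None
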
diff --git a/hdf5storage/lowlevel.py b/hdf5storage/lowlevel.py
index d5cbf88..04cdbd3 100644
--- a/hdf5storage/lowlevel.py
+++ b/hdf5storage/lowlevel.py
@@ -24,12 +24,23 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import posixpath
+
 import numpy as np
 import h5py
 
 from hdf5storage.utilities import *
 
 
+class Hdf5storageError(IOError):
+    """ Base class of hdf5storage package exceptions."""
+    pass
+
+
+class CantReadError(Hdf5storageError):
+    """ Exception for a failure to read the desired data."""
+
+
 def write_data(f, grp, name, data, type_string, options):
     # Get the marshaller for type(data).
 
@@ -67,3 +78,43 @@ def write_data(f, grp, name, data, type_string, options):
                         del_attribute(outputs[0][i][v[0]], 'H5PATH')
     else:
         raise NotImplementedError('Can''t write data type: '+str(tp))
+
+
+def read_data(f, grp, name, options):
+    # If name isn't found, raise an error.
+    if name not in grp:
+        raise CantReadError('Could not find '
+                            + posixpath.join(grp.name, name))
+
+    # Get the different attributes that can be used to identify the
+    # type, which are the type string and the MATLAB class.
+    type_string = get_attribute_string(grp[name], 'CPython.Type')
+    matlab_class = get_attribute_string(grp[name], 'MATLAB_class')
+
+    # If the type_string is present, get the marshaller for it. If it is
+    # not, use the one for the matlab class if it is given. Otherwise,
+    # use the fallback (NumpyScalarArrayMarshaller for Datasets and
+    # PythonDictMarshaller for Groups). If calls to the marshaller
+    # collection to get the right marshaller don't return one (return
+    # None), we also go to the default.
+
+    m = None
+    mc = options.marshaller_collection
+    if type_string is not None:
+        m = mc.get_marshaller_for_type_string(type_string)
+    elif matlab_class is not None:
+        m = mc.get_marshaller_for_matlab_class(matlab_class)
+
+    if m is None:
+        if isinstance(grp[name], h5py.Dataset):
+            m = mc.get_marshaller_for_type(np.uint8)
+        else:
+            m = mc.get_marshaller_for_type(dict)
+
+    # If a marshaller was found, use it to read the data. Otherwise,
+    # raise an error.
+
+    if m is not None:
+        return m.read(f, grp, name, options)
+    else:
+        raise CantReadError('Could not read ' + grp[name].name)
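
A short, hypothetical sketch of how a caller might use the new exception
classes with read_data; the file and variable names, and the
hdf5storage.Options() constructor, are assumptions for illustration.

    import h5py
    import hdf5storage
    from hdf5storage.lowlevel import read_data, CantReadError

    options = hdf5storage.Options()   # assumed entry point for Options
    with h5py.File('data.h5', 'r') as f:
        try:
            value = read_data(f, f['/'], 'maybe_missing', options)
        except CantReadError:
            # Hdf5storageError (an IOError subclass) would also catch it.
            value = None
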
diff --git a/hdf5storage/utilities.py b/hdf5storage/utilities.py
index cd2acd3..c8f1668 100644
--- a/hdf5storage/utilities.py
+++ b/hdf5storage/utilities.py
@@ -31,12 +31,173 @@ import numpy as np
 import h5py
 
 
+def decode_to_numpy_unicode(data):
+    """ Decodes data to Numpy unicode string (str_).
+
+    Decodes `data` to a  Numpy unicode string (UTF-32), which is
+    ``numpy.str_``. If it can't be decoded, it is returned as
+    is. Unsigned integers, Python string types (``str``, ``bytes``), and
+    ``numpy.string_`` are supported.
+
+    Parameters
+    ----------
+    data : some type
+        Data to decode into a Numpy unicode string.
+
+    Returns
+    -------
+    numpy.str_ or data
+        If `data` can be decoded into a ``numpy.str_``, the decoded
+        version is returned. Otherwise, `data` is returned unchanged.
+
+    See Also
+    --------
+    decode_to_numpy_ascii
+    numpy.str_
+
+    """
+    # How the conversion is done depends on the exact underlying
+    # type. For uint types, it is assumed to be stored as ASCII, UTF-16,
+    # or UTF-32 depending on the size.
+    if isinstance(data, (np.ndarray, np.uint8, np.uint16, np.uint32)):
+        if data.dtype.name == 'uint8':
+            data = data.data.tobytes().decode(encoding='ASCII')
+        elif data.dtype.name == 'uint16':
+            data = data.data.tobytes().decode(encoding='UTF-16')
+        elif data.dtype.name == 'uint32':
+            data = data.data.tobytes().decode(encoding='UTF-32')
+
+    if isinstance(data, str):
+        return np.unicode(data)
+    elif isinstance(data, (bytes, np.string_)):
+        return np.unicode(data.decode())
+    else:
+        return data
+
+
+def decode_to_numpy_ascii(data):
+    """ Decodes data to Numpy ASCII string (string_).
+
+    Decodes `data` to a  Numpy ASCII string, which is
+    ``numpy.string_``. If it can't be decoded, it is returned as
+    is. Unsigned integers, Python string types (``str``, ``bytes``), and
+    ``numpy.str_`` (UTF-32) are supported.
+
+    Parameters
+    ----------
+    data : some type
+        Data to decode into a Numpy ASCII string.
+
+    Returns
+    -------
+    numpy.string_ or data
+        If `data` can be decoded into a ``numpy.string_``, the decoded
+        version is returned. Otherwise, `data` is returned unchanged.
+
+    See Also
+    --------
+    decode_to_numpy_unicode
+    numpy.string_
+
+    """
+    # How the conversion is done depends on the exact underlying
+    # type. For uint types, it is assumed to be stored as ASCII, UTF-16,
+    # or UTF-32 depending on the size when converting to a str. Then,
+    # conversions from str, bytes, and numpy.unicode can be done.
+
+    if isinstance(data, (np.ndarray, np.uint8, np.uint16, np.uint32)):
+        if data.dtype.name == 'uint8':
+            data = data.data.tobytes().decode(encoding='ASCII')
+        elif data.dtype.name == 'uint16':
+            data = data.data.tobytes().decode(encoding='UTF-16')
+        elif data.dtype.name == 'uint32':
+            data = data.data.tobytes().decode(encoding='UTF-32')
+
+    if isinstance(data, bytes):
+        return np.string_(data)
+    elif isinstance(data, (str, np.unicode)):
+        return np.string_(data.encode(encoding='ascii',
+                          errors='replace'))
+    else:
+        return data
+
+
+def decode_complex(data):
+    """ Decodes possibly complex data read from an HDF5 file.
+
+    Decodes possibly complex datasets read from an HDF5 file. HDF5
+    doesn't have a native complex type, so complex values are stored as
+    H5T_COMPOUND types with fields such as 'r' and 'i' for the real and
+    imaginary parts. As there is no standardization of field names, the
+    field names have to be analyzed for proper decoding. A variety of
+    reasonably expected combinations of field names are checked and used
+    if available for decoding. If decoding is not possible, the data is
+    returned as is.
+
+    Parameters
+    ----------
+    data : arraylike
+        The data read from an HDF5 file, that might be complex, to
+        decode into the proper Numpy complex type.
+
+    Returns
+    -------
+    decoded data or data
+        If `data` can be decoded into a complex type, the decoded
+        complex version is returned. Otherwise, `data` is returned
+        unchanged.
+
+    """
+    # Complex types are stored in HDF5 files as an H5T_COMPOUND type
+    # with fields along the lines of ('r', 're', 'real') and ('i', 'im',
+    # 'imag', 'imaginary') for the real and imaginary parts, which most
+    # likely won't be converted back into a Python complex type
+    # automatically unless the proper h5py configuration is set. Since we
+    # can't depend on it being set and adjusting it is hazardous (the
+    # setting is global), it is best to just decode it manually. The
+    # fields are obtained from the fields of its dtype. Obviously, if
+    # there are no fields, then there is nothing to do.
+    if data.dtype.fields is None:
+        return data
+
+    fields = list(data.dtype.fields)
+
+    # If there aren't exactly two fields, then it can't be complex.
+    if len(fields) != 2:
+        return data
+
+    # We need to grab the field names for the real and imaginary
+    # parts. This will be done by seeing which list, if any, each field
+    # is in and setting variables to the proper name when found (they
+    # are initialized to None so that we know if one isn't found).
+
+    real_name = None
+    imag_name = None
+
+    real_fields = ['r', 're', 'real']
+    imag_fields = ['i', 'im', 'imag', 'imaginary']
+
+    for s in fields:
+        if s.lower() in real_fields:
+            real_name = s
+        elif s.lower() in imag_fields:
+            imag_name = s
+
+    # If the real and imaginary fields were found, construct the complex
+    # form from the fields. Otherwise, return what we were given because
+    # it isn't in the right form.
+    if real_name is not None and imag_name is not None:
+        return data[real_name] + 1j*data[imag_name]
+    else:
+        return data
+
+
 def get_attribute(target, name):
     """ Gets an attribute from a Dataset or Group.
 
     Gets the value of an Attribute if it is present (get ``None`` if
     not).
-    
+
     Parameters
     ----------
     target : Dataset or Group
@@ -56,6 +217,43 @@ def get_attribute(target, name):
     else:
         return target.attrs[name]
 
+
+def get_attribute_string(target, name):
+    """ Gets a string attribute from a Dataset or Group.
+
+    Gets the value of an Attribute that is a string if it is present
+    (get ``None`` if it is not present or isn't a string type).
+
+    Parameters
+    ----------
+    target : Dataset or Group
+        :py:class:`h5py.Dataset` or :py:class:`h5py.Group` to get the
+        string attribute of.
+    name : str
+        Name of the attribute to get.
+
+    Returns
+    -------
+    str or None
+        The ``str`` value of the attribute if it is present, or ``None``
+        if it is not present or is not a type convertible to ``str``.
+
+    """
+    value = get_attribute(target, name)
+    if value is None:
+        return value
+    elif isinstance(value, str):
+        return value
+    elif isinstance(value, bytes):
+        return value.decode()
+    elif isinstance(value, np.unicode):
+        return str(value)
+    elif isinstance(value, np.string_):
+        return value.decode()
+    else:
+        return None
+
+
 def set_attribute(target, name, value):
     """ Sets an attribute on a Dataset or Group.
 

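
As a final aside, a small sketch (not part of the commit) exercising the new
utilities above; the compound dtype and input values are made up, but the
field names are among those decode_complex recognizes.

    import numpy as np
    from hdf5storage.utilities import decode_complex, decode_to_numpy_unicode

    # A compound array like h5py returns for an HDF5 compound type holding
    # real and imaginary parts.
    raw = np.zeros(3, dtype=[('real', 'f8'), ('imag', 'f8')])
    raw['real'] = [1.0, 2.0, 3.0]
    raw['imag'] = [0.5, -0.5, 0.0]
    print(decode_complex(raw))             # complex128 array

    # A uint32 array holding UTF-32 code units, as a stored str would be.
    chars = np.frombuffer('abc'.encode('UTF-32'), dtype=np.uint32)
    print(decode_to_numpy_unicode(chars))  # prints abc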
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-hdf5storage.git


