[python-hdf5storage] 17/152: Added low level, somewhat buggy for strings, read support for Numpy types.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Feb 29 08:24:30 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to annotated tag 0.1
in repository python-hdf5storage.
commit 7ebe9aaa91db497d7e4febbfb2a8fa4b752fcbe4
Author: Freja Nordsiek <fnordsie at gmail.com>
Date: Tue Jan 21 19:03:47 2014 -0500
Added low level, somewhat buggy for strings, read support for Numpy types.
---
hdf5storage/Marshallers.py | 120 ++++++++++++++++++++++++++-
hdf5storage/core.py | 4 +-
hdf5storage/lowlevel.py | 51 ++++++++++++
hdf5storage/utilities.py | 200 ++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 370 insertions(+), 5 deletions(-)
diff --git a/hdf5storage/Marshallers.py b/hdf5storage/Marshallers.py
index 98d9191..9b2144b 100644
--- a/hdf5storage/Marshallers.py
+++ b/hdf5storage/Marshallers.py
@@ -326,6 +326,23 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
np.string_: 'char',
np.unicode: 'char'}
+ # Make a dict to look up the opposite direction (given a MATLAB
+ # class, which numpy type to use).
+
+ self.__MATLAB_classes_reverse = {'logical': np.bool8,
+ 'uint8': np.uint8,
+ 'uint16': np.uint16,
+ 'uint32': np.uint32,
+ 'uint64': np.uint64,
+ 'int8': np.int8,
+ 'int16': np.int16,
+ 'int32': np.int32,
+ 'int64': np.int64,
+ 'single': np.float32,
+ 'double': np.float64,
+ 'char': np.unicode}
+
+
# Set matlab_classes to the supported classes (the values).
self.matlab_classes = list(self.__MATLAB_classes.values())
@@ -461,6 +478,105 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
else:
del_attribute(grp[name], 'MATLAB_int_decode')
+ def read(self, f, grp, name, options):
+ # If name is not present or is not a Dataset, then we can't read
+ # it and have to throw an error.
+ if name not in grp or not isinstance(grp[name], h5py.Dataset):
+ raise NotImplementedError('No Dataset ' + name +
+ ' is present.')
+
+ # Grab the data's datatype and dtype.
+ datatype = grp[name].id.get_type()
+ h5_dt = datatype.dtype
+
+ # Get the different attributes this marshaller uses.
+
+ type_string = get_attribute_string(grp[name], 'CPython.Type')
+ underlying_type = get_attribute_string(grp[name], \
+ 'CPython.numpy.UnderlyingType')
+ shape = get_attribute(grp[name], 'CPython.Shape')
+ cpython_empty = get_attribute(grp[name], 'CPython.Empty')
+
+ matlab_class = get_attribute_string(grp[name], 'MATLAB_class')
+ matlab_empty = get_attribute(grp[name], 'MATLAB_empty')
+
+ # Read the data and get its dtype. Figuring it out and doing any
+ # conversions can be done later.
+ data = grp[name][...]
+ dt = data.dtype
+
+ # If metadata is present, it can be used to convert the data to
+ # the desired/closest Python data types. If none is present, or
+ # not enough of it, then no conversions can be done.
+
+ if type_string is not None and underlying_type is not None \
+ and shape is not None:
+ # If it is empty ('CPython.Empty' set to 1), then the shape
+ # information is stored in data and we need to set data to
+ # the empty array of the proper type (in underlying_type)
+ # and the given shape.
+ if cpython_empty == 1:
+ data = np.zeros(tuple(np.uint64(data)),
+ dtype=underlying_type)
+
+ # If MATLAB attributes are present or the reverse dimension
+ # order option was given, the dimension order needs to be
+ # reversed. This needs to be done before any reshaping as
+ # the shape was stored before any dimensional reordering.
+ if matlab_class is not None or \
+ options.reverse_dimension_order:
+ data = data.T
+
+ # If the shape of data and the shape attribute are
+ # different but give the same number of elements, then data
+ # needs to be reshaped.
+ if tuple(shape) != data.shape \
+ and np.prod(shape) == np.prod(data.shape):
+ data.shape = tuple(shape)
+
+ # String types might have to be decoded depending on the
+ # underlying type, and MATLAB class if given.
+ if underlying_type[0:5] == 'bytes':
+ data = decode_to_numpy_ascii(data)
+ elif underlying_type[0:3] == 'str' \
+ or matlab_class == 'char':
+ data = decode_to_numpy_unicode(data)
+
+ # If it is a complex type, then it needs to be decoded
+ # properly.
+ if underlying_type[0:7] == 'complex':
+ data = decode_complex(data)
+
+ elif matlab_class in self.__MATLAB_classes_reverse:
+ # MATLAB formatting information was given. The extraction
+ # did most of the work except handling empties, array
+ # dimension order, and string conversion.
+
+ # If it is empty ('MATLAB_empty' set to 1), then the shape
+ # information is stored in data and we need to set data to
+ # the empty array of the proper type.
+ if matlab_empty == 1:
+ data = np.zeros(tuple(np.uint64(data)), \
+ dtype=self.__MATLAB_classes_reverse[matlab_class])
+
+ # The order of the dimensions must be switched from Fortran
+ # order, which MATLAB uses, to C order, which Python uses.
+ data = data.T
+
+ # Now, if the matlab class is 'single' or 'double', data
+ # could possibly be a complex type which needs to be
+ # properly decoded.
+ if matlab_class in ['single', 'double']:
+ data = decode_complex(data)
+
+ # If it is a 'char' type, the proper conversion to
+ # numpy.unicode needs to be done.
+ if matlab_class == 'char':
+ data = decode_to_numpy_unicode(data)
+
+ # Done adjusting data, so it can be returned.
+ return data
+
class PythonScalarMarshaller(NumpyScalarArrayMarshaller):
def __init__(self):
@@ -499,7 +615,7 @@ class PythonStringMarshaller(NumpyScalarArrayMarshaller):
# is a bytearray in which case it needs to be converted to a
# uint8 array.
- if isinstance(data,bytearray):
+ if isinstance(data, bytearray):
cdata = np.uint8(data)
else:
cdata = np.string_(data)
@@ -526,7 +642,7 @@ class PythonNoneMarshaller(NumpyScalarArrayMarshaller):
# data and the right type_string set (parent can't guess right
# from the modified form).
NumpyScalarArrayMarshaller.write(self, f, grp, name,
- np.ndarray(shape=(0,0),
+ np.ndarray(shape=(0, 0),
dtype='float64'),
self.get_type_string(data,
type_string), options)
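
As a rough illustration of the dimension handling in the new read() above: data coming back from a MATLAB-written Dataset is in Fortran (column-major) dimension order, so it is transposed before the stored shape is applied. A minimal standalone sketch of that step (the array and stored shape here are made up for illustration, not taken from the patch):

    import numpy as np

    raw = np.arange(6).reshape((3, 2))   # data as h5py hands it back (dimensions reversed)
    stored_shape = (2, 3)                # shape recorded in the 'CPython.Shape' attribute

    data = raw.T                         # undo the dimension-order reversal
    if data.shape != tuple(stored_shape) and np.prod(stored_shape) == data.size:
        data = data.reshape(stored_shape)  # restore the originally stored shape
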
diff --git a/hdf5storage/core.py b/hdf5storage/core.py
index 5479f0e..cf78928 100644
--- a/hdf5storage/core.py
+++ b/hdf5storage/core.py
@@ -38,7 +38,7 @@ import h5py
from hdf5storage.utilities import *
-from hdf5storage.lowlevel import write_data
+from hdf5storage.lowlevel import write_data, read_data
from hdf5storage import Marshallers
@@ -589,7 +589,7 @@ class MarshallerCollection(object):
hdf5storage.Marshallers.TypeMarshaller.cpython_type_strings
"""
- if type_string in self._matlab_classes:
+ if matlab_class in self._matlab_classes:
return copy.deepcopy(self._matlab_classes[matlab_class])
else:
return None
diff --git a/hdf5storage/lowlevel.py b/hdf5storage/lowlevel.py
index d5cbf88..04cdbd3 100644
--- a/hdf5storage/lowlevel.py
+++ b/hdf5storage/lowlevel.py
@@ -24,12 +24,23 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import posixpath
+
import numpy as np
import h5py
from hdf5storage.utilities import *
+class Hdf5storageError(IOError):
+ """ Base class of hdf5storage package exceptions."""
+ pass
+
+
+class CantReadError(Hdf5storageError):
+ """ Exception for a failure to read the desired data."""
+
+
def write_data(f, grp, name, data, type_string, options):
# Get the marshaller for type(data).
@@ -67,3 +78,43 @@ def write_data(f, grp, name, data, type_string, options):
del_attribute(outputs[0][i][v[0]], 'H5PATH')
else:
raise NotImplementedError('Can''t write data type: '+str(tp))
+
+
+def read_data(f, grp, name, options):
+ # If name isn't found, raise an error.
+ if name not in grp:
+ raise CantReadError('Could not find '
+ + posixpath.join(grp.name, name))
+
+ # Get the different attributes that can be used to identify the
+ # type, which are the type string and the MATLAB class.
+ type_string = get_attribute_string(grp[name], 'CPython.Type')
+ matlab_class = get_attribute_string(grp[name], 'MATLAB_class')
+
+ # If the type_string is present, get the marshaller for it. If it
+ # is not, use the one for the matlab class if it is given.
+ # Otherwise, use the fallback (NumpyScalarArrayMarshaller for
+ # Datasets and PythonDictMarshaller for Groups). If calls to the
+ # marshaller collection to get the right marshaller don't return
+ # one (return None), we also go to the default.
+
+ m = None
+ mc = options.marshaller_collection
+ if type_string is not None:
+ m = mc.get_marshaller_for_type_string(type_string)
+ elif matlab_class is not None:
+ m = mc.get_marshaller_for_matlab_class(matlab_class)
+
+ if m is None:
+ if isinstance(grp[name], h5py.Dataset):
+ m = mc.get_marshaller_for_type(np.uint8)
+ else:
+ m = mc.get_marshaller_for_type(dict)
+
+ # If a marshaller was found, use it to read the data. Otherwise,
+ # raise an error.
+
+ if m is not None:
+ return m.read(f, grp, name, options)
+ else:
+ raise CantReadError('Could not read ' + grp[name].name)
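
As a rough usage sketch of the new read_data entry point (the file name and dataset name below are hypothetical, and this assumes an hdf5storage.Options object is already available at this point in the history; in practice the higher-level API is expected to call read_data rather than user code):

    import h5py
    import hdf5storage
    from hdf5storage.lowlevel import read_data

    options = hdf5storage.Options()
    with h5py.File('data.h5', 'r') as f:            # hypothetical file
        value = read_data(f, f['/'], 'a', options)  # read the Dataset or Group '/a'
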
diff --git a/hdf5storage/utilities.py b/hdf5storage/utilities.py
index cd2acd3..c8f1668 100644
--- a/hdf5storage/utilities.py
+++ b/hdf5storage/utilities.py
@@ -31,12 +31,173 @@ import numpy as np
import h5py
+def decode_to_numpy_unicode(data):
+ """ Decodes data to Numpy unicode string (str_).
+
+ Decodes `data` to a Numpy unicode string (UTF-32), which is
+ ``numpy.str_``. If it can't be decoded, it is returned as
+ is. Unsigned integers, Python string types (``str``, ``bytes``), and
+ ``numpy.string_`` are supported.
+
+ Parameters
+ ----------
+ data : some type
+ Data to decode into a Numpy unicode string.
+
+ Returns
+ -------
+ numpy.str_ or data
+ If `data` can be decoded into a ``numpy.str_``, the decoded
+ version is returned. Otherwise, `data` is returned unchanged.
+
+ See Also
+ --------
+ decode_to_numpy_ascii
+ numpy.str_
+
+ """
+ # How the conversion is done depends on the exact underlying
+ # type. For uint types, it is assumed to be stored as ASCII, UTF-16,
+ # or UTF-32 depending on the size.
+ if isinstance(data, (np.ndarray, np.uint8, np.uint16, np.uint32)):
+ if data.dtype.name == 'uint8':
+ data = data.data.tobytes().decode(encoding='ASCII')
+ elif data.dtype.name == 'uint16':
+ data = data.data.tobytes().decode(encoding='UTF-16')
+ elif data.dtype.name == 'uint32':
+ data = data.data.tobytes().decode(encoding='UTF-32')
+
+ if isinstance(data, str):
+ return np.unicode(data)
+ elif isinstance(data, (bytes, np.string_)):
+ return np.unicode(data.decode())
+ else:
+ return data
+
+
+def decode_to_numpy_ascii(data):
+ """ Decodes data to Numpy ASCII string (string_).
+
+ Decodes `data` to a Numpy ASCII string, which is
+ ``numpy.string_``. If it can't be decoded, it is returned as
+ is. Unsigned integers, Python string types (``str``, ``bytes``), and
+ ``numpy.str_`` (UTF-32) are supported.
+
+ Parameters
+ ----------
+ data : some type
+ Data to decode into a Numpy ASCII string.
+
+ Returns
+ -------
+ numpy.string_ or data
+ If `data` can be decoded into a ``numpy.string_``, the decoded
+ version is returned. Otherwise, `data` is returned unchanged.
+
+ See Also
+ --------
+ decode_to_numpy_unicode
+ numpy.str_
+
+ """
+ # How the conversion is done depends on the exact underlying
+ # type. For uint types, it is assumed to be stored as ASCII, UTF-16,
+ # or UTF-32 depending on the size when converting to a str. Then,
+ # conversions from str, bytes, and numpy.unicode can be done.
+
+ if isinstance(data, (np.ndarray, np.uint8, np.uint16, np.uint32)):
+ if data.dtype.name == 'uint8':
+ data = data.data.tobytes().decode(encoding='ASCII')
+ elif data.dtype.name == 'uint16':
+ data = data.data.tobytes().decode(encoding='UTF-16')
+ elif data.dtype.name == 'uint32':
+ data = data.data.tobytes().decode(encoding='UTF-32')
+
+ if isinstance(data, bytes):
+ return np.string_(data)
+ elif isinstance(data, (str, np.unicode)):
+ return np.string_(data.encode(encoding='ascii',
+ errors='replace'))
+ else:
+ return data
+
+
+def decode_complex(data):
+ """ Decodes possibly complex data read from an HDF5 file.
+
+ Decodes possibly complex datasets read from an HDF5 file. HDF5
+ doesn't have a native complex type, so they are stored as
+ H5T_COMPOUND types with fields such as 'r' and 'i' for the real and
+ imaginary parts. As there is no standardization for field names, the
+ field names have to be analyzed for proper decoding. A variety of
+ reasonably expected combinations of field names are checked and used
+ if available to decode. If decoding is not possible, it is returned
+ as is.
+
+ Parameters
+ ----------
+ data : arraylike
+ The data read from an HDF5 file, that might be complex, to
+ decode into the proper Numpy complex type.
+
+ Returns
+ -------
+ decoded data or data
+ If `data` can be decoded into a complex type, the decoded
+ complex version is returned. Otherwise, `data` is returned
+ unchanged.
+
+ """
+ # Complex types are stored in HDF5 files as an H5T_COMPOUND type
+ # with fields along the lines of ('r', 're', 'real') and ('i', 'im',
+ # 'imag', 'imaginary') for the real and imaginary parts, which most
+ # likely won't be properly converted back into a Python complex
+ # type unless the proper h5py configuration is set. Since we can't
+ # depend on it being set and adjusting it is hazardous (the setting
+ # is global), it is best to just decode it manually. These fields
+ # are obtained from the fields of its dtype. Obviously, if there
+ # are no fields, then there is nothing to do.
+ if data.dtype.fields is None:
+ return data
+
+ fields = list(data.dtype.fields)
+
+ # If there aren't exactly two fields, then it can't be complex.
+ if len(fields) != 2:
+ return data
+
+ # We need to grab the field names for the real and imaginary
+ # parts. This will be done by seeing which list, if any, each field
+ # is in and setting variables to the proper name if it is found
+ # (they are initialized to None so that we know if one isn't found).
+
+ real_name = None
+ imag_name = None
+
+ real_fields = ['r', 're', 'real']
+ imag_fields = ['i', 'im', 'imag', 'imaginary']
+
+ for s in fields:
+ if s.lower() in real_fields:
+ real_name = s
+ elif s.lower() in imag_fields:
+ imag_name = s
+
+ # If the real and imaginary fields were found, construct the complex
+ # form from the fields. Otherwise, return what we were given because
+ # it isn't in the right form.
+ if real_name is not None and imag_name is not None:
+ return data[real_name] + 1j*data[imag_name]
+ else:
+ return data
+
+
def get_attribute(target, name):
""" Gets an attribute from a Dataset or Group.
Gets the value of an Attribute if it is present (get ``None`` if
not).
-
+
Parameters
----------
target : Dataset or Group
@@ -56,6 +217,43 @@ def get_attribute(target, name):
else:
return target.attrs[name]
+
+def get_attribute_string(target, name):
+ """ Gets a string attribute from a Dataset or Group.
+
+ Gets the value of an Attribute that is a string if it is present
+ (get ``None`` if it is not present or isn't a string type).
+
+ Parameters
+ ----------
+ target : Dataset or Group
+ :py:class:`h5py.Dataset` or :py:class:`h5py.Group` to get the
+ string attribute of.
+ name : str
+ Name of the attribute to get.
+
+ Returns
+ -------
+ str or None
+ The ``str`` value of the attribute if it is present, or ``None``
+ if it isn't present or isn't a type that can be converted to ``str``.
+
+ """
+ value = get_attribute(target, name)
+ if value is None:
+ return value
+ elif isinstance(value, str):
+ return value
+ elif isinstance(value, bytes):
+ return value.decode()
+ elif isinstance(value, np.unicode):
+ return str(value)
+ elif isinstance(value, np.string_):
+ return value.decode()
+ else:
+ return None
+
+
def set_attribute(target, name, value):
""" Sets an attribute on a Dataset or Group.
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-hdf5storage.git