[python-hdf5storage] 54/152: Added read/write support for numpy.object_ types (become HDF5 references).

Mon Feb 29 08:24:33 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to annotated tag 0.1
in repository python-hdf5storage.

commit 1083c70d09abbb937980c4513d4d8b4cf7bfb63a
Author: Freja Nordsiek <fnordsie at gmail.com>
Date:   Tue Jan 28 02:36:51 2014 -0500

    Added read/write support for numpy.object_ types (become HDF5 references).
---
 README.rst                             |  2 +
 doc/source/hdf5storage.Marshallers.rst |  6 +--
 doc/source/hdf5storage.utilities.rst   |  6 +++
 hdf5storage/Marshallers.py             | 77 ++++++++++++++++++++++++++++++++--
 hdf5storage/__init__.py                | 37 ++++++++++++++++
 hdf5storage/utilities.py               | 33 +++++++++++++++
 6 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/README.rst b/README.rst
index 2266959..02238ae 100644
--- a/README.rst
+++ b/README.rst
@@ -82,6 +82,7 @@ np.complex64   0.1                          single   0.1
 np.complex128  0.1                          double   0.1
 np.str\_       0.1      np.uint32           uint32   0.1 [2]_
 np.bytes\_     0.1                          char     0.1
+np.object\_    0.1                          cell     0.1
 dict           0.1                          struct   0.1 [3]_
 =============  =======  ==================  =======  ========
 
@@ -108,6 +109,7 @@ int16         0.1      np.int16
 int32         0.1      np.int32
 int64         0.1      np.int64
 struct        0.1      dict [5]_
+cell          0.1      np.object\_
 ============  =======  ================================
 
 .. [4] Depends on whether there is a complex part or not.
diff --git a/doc/source/hdf5storage.Marshallers.rst b/doc/source/hdf5storage.Marshallers.rst
index 9b27d7f..b2329b1 100644
--- a/doc/source/hdf5storage.Marshallers.rst
+++ b/doc/source/hdf5storage.Marshallers.rst
@@ -52,7 +52,7 @@ NumpyScalarArrayMarshaller
                       np.int8, np.int16, np.int32, np.int64,
                       np.float16, np.float32, np.float64,
                       np.complex64, np.complex128,
-                      np.bytes_, np.str_]
+                      np.bytes_, np.str_, np.object_]
 
    .. autoinstanceattribute:: NumpyScalarArrayMarshaller.cpython_type_strings
       :annotation: = ['numpy.ndarray', 'numpy.matrix',
@@ -61,12 +61,12 @@ NumpyScalarArrayMarshaller
 		      'numpy.int16', 'numpy.int32', 'numpy.int64',
                       'numpy.float16', 'numpy.float32', 'numpy.float64',
                       'numpy.complex64', 'numpy.complex128',
-                      'numpy.bytes_', 'numpy.str_']
+                      'numpy.bytes_', 'numpy.str_', 'numpy.object_']
 
    .. autoinstanceattribute:: NumpyScalarArrayMarshaller.matlab_classes
       :annotation: = ['logical', 'char', 'single', 'double', 'uint8',
 	              'uint16', 'uint32', 'uint64', 'int8', 'int16',
-                      'int32', 'int64']
+                      'int32', 'int64', 'cell']
 
 
 PythonScalarMarshaller
diff --git a/doc/source/hdf5storage.utilities.rst b/doc/source/hdf5storage.utilities.rst
index 5f83b08..d129080 100644
--- a/doc/source/hdf5storage.utilities.rst
+++ b/doc/source/hdf5storage.utilities.rst
@@ -6,6 +6,12 @@ hdf5storage.utilities
 .. automodule:: hdf5storage.utilities
 
 
+next_unused_name_in_group
+-------------------------
+
+.. autofunction:: next_unused_name_in_group
+
+
 decode_to_str
 -------------
 
diff --git a/hdf5storage/Marshallers.py b/hdf5storage/Marshallers.py
index 97a5c38..4b85289 100644
--- a/hdf5storage/Marshallers.py
+++ b/hdf5storage/Marshallers.py
@@ -28,6 +28,8 @@
 
 """
 
+import posixpath
+
 import numpy as np
 import h5py
 
@@ -312,7 +314,7 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                       np.int8, np.int16, np.int32, np.int64,
                       np.float16, np.float32, np.float64,
                       np.complex64, np.complex128,
-                      np.bytes_, np.str_]
+                      np.bytes_, np.str_, np.object_]
         self.cpython_type_strings = ['numpy.ndarray', 'numpy.matrix',
                                      'numpy.bool_',
                                      'numpy.uint8', 'numpy.uint16',
@@ -323,7 +325,8 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                                      'numpy.float64',
                                      'numpy.complex64',
                                      'numpy.complex128',
-                                     'numpy.bytes_', 'numpy.str_']
+                                     'numpy.bytes_', 'numpy.str_',
+                                     'numpy.object_']
 
         # If we are storing in MATLAB format, we will need to be able to
         # set the MATLAB_class attribute. The different numpy types just
@@ -344,7 +347,8 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                                  np.complex64: 'single',
                                  np.complex128: 'double',
                                  np.bytes_: 'char',
-                                 np.str_: 'char'}
+                                 np.str_: 'char',
+                                 np.object_: 'cell'}
 
         # Make a dict to look up the opposite direction (given a matlab
         # class, what numpy type to use.
@@ -360,7 +364,8 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                                          'int64': np.int64,
                                          'single': np.float32,
                                          'double': np.float64,
-                                         'char': np.str_}
+                                         'char': np.str_,
+                                         'cell': np.object_}
 
 
         # Set matlab_classes to the supported classes (the values).
@@ -428,6 +433,46 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
             data_to_store = encode_complex(data_to_store,
                                            options.complex_names)
 
+        # If we are storing an object type and it isn't empty
+        # (data_to_store is still an object), then we must recursively
+        # write what each element points to and make an array of the
+        # references to them.
+
+        if data_to_store.dtype.name == 'object':
+            ref_dtype = h5py.special_dtype(ref=h5py.Reference)
+            data_refs = data_to_store.copy()
+
+            # Go through all the elements of data and write them,
+            # grabbing their references and putting them in
+            # data_refs. They will be put in group_for_references, which
+            # is also what the H5PATH needs to be set to if we are doing
+            # MATLAB compatibility (otherwise, the attribute needs to be
+            # deleted).
+
+            if options.group_for_references not in f:
+                f.create_group(options.group_for_references)
+
+            grp2 = f[options.group_for_references]
+
+            if not isinstance(grp2, h5py.Group):
+                del f[options.group_for_references]
+                grp2 = f[options.group_for_references]
+
+            for index, x in np.ndenumerate(data_to_store):
+                data_refs[index] = None
+                name_for_ref = next_unused_name_in_group(grp2, 16)
+                write_data(f, grp2, name_for_ref, x, None, options)
+                data_refs[index] = grp2[name_for_ref].ref
+                if options.MATLAB_compatible:
+                    set_attribute_string(grp2[name_for_ref],
+                                         'H5PATH', grp2.name)
+                else:
+                    del_attribute(grp2[k], 'H5PATH')
+
+            # Now, the dtype needs to be changed to the reference type
+            # and the whole thing copied over to data_to_store.
+            data_to_store = data_refs.astype(dtype=ref_dtype)
+
         # The data must first be written. If name is not present yet,
         # then it must be created. If it is present, but not a Dataset,
         # has the wrong dtype, or is the wrong shape; then it must be
@@ -552,6 +597,27 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
         data = grp[name][...]
         dt = data.dtype
 
+        # If it is a reference type, then we need to make an object
+        # array that is its replicate, but with the objects they are
+        # pointing to in their elements instead of just the references.
+        if h5py.check_dtype(ref=grp[name].dtype) is not None:
+            data_derefed = data.copy().astype(np.dtype('object'))
+
+            # Go through all the elements of data and read them using
+            # their references, and then putting the output in
+            # data_derefed. If they can't be read, None is put in.
+
+            for index, x in np.ndenumerate(data):
+                data_derefed[index] = None
+                try:
+                    data_derefed[index] = read_data(f, f[x].parent, \
+                        posixpath.basename(f[x].name), options)
+                except:
+                    raise
+
+            # Now all that needs to be done is copy back to data.
+            data = data_derefed.copy()
+
         # If metadata is present, that can be used to do convert to the
         # desired/closest Python data types. If none is present, or not
         # enough of it, then no conversions can be done.
@@ -885,6 +951,9 @@ class PythonDictMarshaller(TypeMarshaller):
         # them (nothing needs to be done).
         data = dict()
         for k in grp[name]:
+            # We must exclude group_for_references
+            if grp[name][k].name == options.group_for_references:
+                continue
             try:
                 data[k] = read_data(f, grp[name], k, options)
             except:
diff --git a/hdf5storage/__init__.py b/hdf5storage/__init__.py
index a6b4a0e..349cf62 100644
--- a/hdf5storage/__init__.py
+++ b/hdf5storage/__init__.py
@@ -207,6 +207,7 @@ class Options(object):
     reverse_dimension_order    ``True``
     store_shape_for_empty      ``True``
     complex_names              ``('real', 'imag')``
+    group_for_references       ``'/#refs#'``
     =========================  ====================
 
     In addition to setting these options, a specially formatted block of
@@ -233,6 +234,8 @@ class Options(object):
         See Attributes.
     complex_names : tuple of two str, optional
         See Attributes.
+    group_for_references : str, optional
+        See Attributes.
     marshaller_collection : MarshallerCollection, optional
         See Attributes.
 
@@ -247,6 +250,7 @@ class Options(object):
     reverse_dimension_order : bool
     store_shape_for_empty : bool
     complex_names : tuple of two str
+    group_for_references : str
     scalar_options : dict
         ``h5py.Group.create_dataset`` options for writing scalars.
     array_options : dict
@@ -264,6 +268,7 @@ class Options(object):
                  reverse_dimension_order=False,
                  store_shape_for_empty=False,
                  complex_names=('r', 'i'),
+                 group_for_references="/#refs#",
                  marshaller_collection=None):
         # Set the defaults.
 
@@ -275,6 +280,7 @@ class Options(object):
         self._reverse_dimension_order = False
         self._store_shape_for_empty = False
         self._complex_names = ('r', 'i')
+        self._group_for_references = "/#refs#"
         self._MATLAB_compatible = True
 
         # Apply all the given options using the setters, making sure to
@@ -289,6 +295,7 @@ class Options(object):
         self.reverse_dimension_order = reverse_dimension_order
         self.store_shape_for_empty = store_shape_for_empty
         self.complex_names = complex_names
+        self.group_for_references = group_for_references
         self.MATLAB_compatible = MATLAB_compatible
 
         # Set the h5py options to use for writing scalars and arrays to
@@ -352,6 +359,7 @@ class Options(object):
         reverse_dimension_order    ``True``
         store_shape_for_empty      ``True``
         complex_names              ``('real', 'imag')``
+        group_for_references       ``'/#refs#'``
         =========================  ====================
 
         In addition to setting these options, a specially formatted
@@ -375,6 +383,7 @@ class Options(object):
                 self._reverse_dimension_order = True
                 self._store_shape_for_empty = True
                 self._complex_names = ('real', 'imag')
+                self._group_for_references = "/#refs#"
 
     @property
     def delete_unused_variables(self):
@@ -567,6 +576,34 @@ class Options(object):
         if self._complex_names != ('real', 'imag'):
             self._MATLAB_compatible = False
 
+    @property
+    def group_for_references(self):
+        """ Path for where to put objects pointed at by references.
+
+        str
+
+        The absolute POSIX path for the Group to place all data that is
+        pointed to by another piece of data (needed for
+        ``numpy.object_`` and similar types). This path is automatically
+        excluded from its parent group when reading back a ``dict``.
+
+        Must be ``'/#refs#'`` if doing MATLAB compatibility.
+
+        """
+        return self._group_for_references
+
+    @group_for_references.setter
+    def group_for_references(self, value):
+        # Check that it is a str and a valid absolute POSIX path, then
+        # set it. If it is something other than "/#refs#", then we are
+        # not doing MATLAB compatible formatting.
+        if isinstance(value, str):
+            pth = posixpath.normpath(value)
+            if len(pth) > 1 and posixpath.isabs(pth):
+                self._group_for_references = value
+        if self._group_for_references != "/#refs#":
+            self._MATLAB_compatible = False
+
 
 class MarshallerCollection(object):
     """ Represents, maintains, and retreives a set of marshallers.
diff --git a/hdf5storage/utilities.py b/hdf5storage/utilities.py
index d4c7a32..5a880f6 100644
--- a/hdf5storage/utilities.py
+++ b/hdf5storage/utilities.py
@@ -28,10 +28,43 @@
 
 """
 
+import string
+import random
+
 import numpy as np
 import h5py
 
 
+def next_unused_name_in_group(grp, length):
+    """ Gives a name that isn't used in a Group.
+
+    Generates a name of the desired length that is not a Dataset or
+    Group in the given group. Note, if length is not large enough and
+    `grp` is full enough, there may be no available names meaning that
+    this function will hang.
+
+    Parameters
+    ----------
+    grp : h5py.Group or h5py.File
+        The HDF5 Group (or File if at '/') to generate an unused name
+        in.
+    length : int
+        Number of characters the name should be.
+
+    Returns
+    -------
+    str
+        A name that isn't already an existing Dataset or Group in
+        `grp`.
+
+    """
+    ltrs = string.ascii_letters + string.digits
+    existing_names = set(grp.keys())
+    while True:
+        name = ''.join([random.choice(ltrs) for i in range(0, length)])
+        if name not in existing_names:
+            return name
+
 def decode_to_str(data):
     """ Decodes data to the Python str type.
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-hdf5storage.git