[python-hdf5storage] 85/152: Added an option to optionally convert numpy.str_ to np.uint16 in UTF-16 format and changed it so that str is converted to numpy.str_ instead of numpy.bytes_.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Feb 29 08:24:37 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to annotated tag 0.1
in repository python-hdf5storage.
commit ec11484243f8b33e26de2d1c27d4a1f392806c2b
Author: Freja Nordsiek <fnordsie at gmail.com>
Date: Sun Feb 2 22:24:06 2014 -0500
Added an option to optionally convert numpy.str_ to np.uint16 in UTF-16 format and changed it so that str is converted to numpy.str_ instead of numpy.bytes_.
---
README.rst | 88 +++++++++++++++++++---------------
hdf5storage/Marshallers.py | 46 +++++++++++-------
hdf5storage/__init__.py | 50 ++++++++++++++++++-
hdf5storage/utilities.py | 117 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 244 insertions(+), 57 deletions(-)
diff --git a/README.rst b/README.rst
index 4584b14..a4604bd 100644
--- a/README.rst
+++ b/README.rst
@@ -62,40 +62,52 @@ will be what it is read back as) the MATLAB class it becomes if
targeting a MAT file, and the first version of this package to
support writing it so MATLAB can read it.
-============= ======= ================== ======= ========
-Python MATLAB
------------------------------------------- -----------------
-Type Version Converted to Class Version
-============= ======= ================== ======= ========
-bool 0.1 np.bool\_ logical 0.1
-None 0.1 ``np.float64([])`` ``[]`` 0.1
-int 0.1 np.int64 int64 0.1
-float 0.1 np.float64 double 0.1
-complex 0.1 np.complex128 double 0.1
-str 0.1 np.bytes\_ char 0.1 [1]_
-bytes 0.1 np.bytes\_ char 0.1
-bytearray 0.1 np.bytes\_ char 0.1
-np.bool\_ 0.1 logical 0.1
-np.uint8 0.1 uint8 0.1
+============= ======= ==================== =========== ========
+Python MATLAB
+-------------------------------------------- ---------------------
+Type Version Converted to Class Version
+============= ======= ==================== =========== ========
+bool 0.1 np.bool\_/np.uint8 logical 0.1 [1]_
+None 0.1 ``np.float64([])`` ``[]`` 0.1
+int 0.1 np.int64 int64 0.1
+float 0.1 np.float64 double 0.1
+complex 0.1 np.complex128 double 0.1
+str 0.1 np.uint32/16 char 0.1 [2]_
+bytes 0.1 np.bytes\_/np.uint16 char 0.1 [3]_
+bytearray 0.1 np.bytes\_/np.uint16 char 0.1 [3]_
+np.bool\_ 0.1 logical 0.1
+np.uint8 0.1 uint8 0.1
np.float16 0.1
-np.float32 0.1 single 0.1
-np.float64 0.1 double 0.1
-np.complex64 0.1 single 0.1
-np.complex128 0.1 double 0.1
-np.str\_ 0.1 np.uint32 uint32 0.1 [2]_
-np.bytes\_ 0.1 char 0.1
-np.object\_ 0.1 cell 0.1
-dict 0.1 struct 0.1 [3]_
-list 0.1 np.object\_ cell 0.1
-tuple 0.1 np.object\_ cell 0.1
-set 0.1 np.object\_ cell 0.1
-frozenset 0.1 np.object\_ cell 0.1
-cl.deque 0.1 np.object\_ cell 0.1
-============= ======= ================== ======= ========
-
-.. [1] Converted to ASCII, so characters outside of that set are lost.
-.. [2] Simply copied over as the uint32 versions of each UTF-32 character.
-.. [3] All keys must be ``str``.
+np.float32 0.1 single 0.1
+np.float64 0.1 double 0.1
+np.complex64 0.1 single 0.1
+np.complex128 0.1 double 0.1
+np.str\_ 0.1 np.uint32/16 char/uint32 0.1 [2]_
+np.bytes\_ 0.1 np.bytes\_/np.uint16 char 0.1 [3]_
+np.object\_ 0.1 cell 0.1
+dict 0.1 struct 0.1 [4]_
+list 0.1 np.object\_ cell 0.1
+tuple 0.1 np.object\_ cell 0.1
+set 0.1 np.object\_ cell 0.1
+frozenset 0.1 np.object\_ cell 0.1
+cl.deque 0.1 np.object\_ cell 0.1
+============= ======= ==================== =========== ========
+
+.. [1] Depends on the selected options. Always ``np.uint8`` when doing
+ MATLAB compatibility, or if the option is explicitly set.
+.. [2] Depends on the selected options and whether it can be converted
+ to UTF-16 without using doublets. If the option is explicitly set
+ (or implicitly through doing MATLAB compatibility) and it can be
+ converted to UTF-16 without losing any characters that can't be
+ represented in UTF-16 or using UTF-16 doublets (MATLAB doesn't
+ support them), then it is written as ``np.uint16`` in UTF-16
+ encoding. Otherwise, it is stored as ``np.uint32`` in UTF-32
+ encoding.
+.. [3] Depends on the selected options. If the option is explicitly set
+ (or implicitly through doing MATLAB compatibility), it will be
+ stored as ``np.uint16`` in UTF-16 encoding. Otherwise, it is just
+ written as ``np.bytes_``.
+.. [4] All keys must be ``str``.
This table gives the MATLAB classes that can be read from a MAT file,
the first version of this package that can read them, and the Python
@@ -105,8 +117,8 @@ type they are read as.
MATLAB Class Version Python Type
============ ======= ================================
logical 0.1 np.bool\_
-single 0.1 np.float32 or np.complex64 [4]_
-double 0.1 np.float64 or np.complex128 [4]_
+single 0.1 np.float32 or np.complex64 [5]_
+double 0.1 np.float64 or np.complex128 [5]_
uint8 0.1 np.uint8
uint16 0.1 np.uint16
uint32 0.1 np.uint32
@@ -115,9 +127,9 @@ int8 0.1 np.int8
int16 0.1 np.int16
int32 0.1 np.int32
int64 0.1 np.int64
-struct 0.1 dict [5]_
+struct 0.1 dict [6]_
cell 0.1 np.object\_
============ ======= ================================
-.. [4] Depends on whether there is a complex part or not.
-.. [5] Structure arrays are not supported.
+.. [5] Depends on whether there is a complex part or not.
+.. [6] Structure arrays are not supported.
diff --git a/hdf5storage/Marshallers.py b/hdf5storage/Marshallers.py
index caf6ce1..be109e4 100644
--- a/hdf5storage/Marshallers.py
+++ b/hdf5storage/Marshallers.py
@@ -390,21 +390,33 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
data_to_store = np.uint16(np.atleast_1d( \
data_to_store).view(np.uint8))
- # As of 2013-12-13, h5py cannot write numpy.unicode (UTF-32
- # encoding) types. If it is just a numpy.unicode object, we can
- # force it to UTF-16 or just write it as uint32's. If it is an
- # array, forcing it to UTF-16 is a bad idea because characters
- # are not always 2 bytes long in UTF-16. So, converting them to
- # uint32 makes the most sense.
+ # As of 2013-12-13, h5py cannot write numpy.str_ (UTF-32
+ # encoding) types. If the option is set to try to convert them
+ # to UTF-16, then an attempt at the conversion is made. If no
+ # conversion is to be done, the conversion throws an exception
+ # (a UTF-32 character had no UTF-16 equivalent), or a UTF-32
+ # character gets turned into a UTF-16 doublet (the increase in
+ # the number of columns will be by a factor more than the length
+ # of the strings); then it will be simply converted to uint32's
+ # byte for byte instead.
if data.dtype.type == np.str_:
- if data_to_store.nbytes == 0:
- data_to_store = np.uint32([])
+ new_data = None
+ if options.convert_numpy_str_to_utf16:
+ try:
+ new_data = convert_numpy_str_to_uint16( \
+ data_to_store)
+ except:
+ pass
+ if new_data is None or (type(data_to_store) == np.str_ \
+ and len(data_to_store) == len(new_data)) \
+ or (isinstance(data_to_store, np.ndarray) \
+ and new_data.shape[-1] != data_to_store.shape[-1] \
+ * (data_to_store.dtype.itemsize//4)):
+ data_to_store = convert_numpy_str_to_uint32( \
+ data_to_store)
else:
- shape = list(np.atleast_1d(data_to_store).shape)
- shape[-1] *= data_to_store.dtype.itemsize//4
- data_to_store = data_to_store.flatten().view(np.uint32)
- data_to_store = data_to_store.reshape(tuple(shape))
+ data_to_store = new_data
# Convert scalars to arrays if that option is set.
@@ -552,9 +564,10 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
# If we are making it MATLAB compatible, the MATLAB_class
# attribute needs to be set looking up the data type (gotten
- # using np.dtype.type) and if it is a string or bool type, then
- # the MATLAB_int_decode attribute must be set
- # properly. Otherwise, the attributes must be deleted.
+ # using np.dtype.type). If it is a string or bool type, then
+ # the MATLAB_int_decode attribute must be set to the number of
+ # bytes each element takes up (dtype.itemsize). Otherwise,
+ # the attributes must be deleted.
if options.matlab_compatible:
tp = data.dtype.type
@@ -566,8 +579,7 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
if tp in (np.bytes_, np.str_, np.bool_):
set_attribute(grp[name], 'MATLAB_int_decode', np.int64(
- {np.bool_: 1, np.bytes_: 2,
- np.str_: 4}[tp]))
+ grp[name].dtype.itemsize))
else:
del_attribute(grp[name], 'MATLAB_int_decode')
diff --git a/hdf5storage/__init__.py b/hdf5storage/__init__.py
index de228b1..8b3cec5 100644
--- a/hdf5storage/__init__.py
+++ b/hdf5storage/__init__.py
@@ -203,6 +203,7 @@ class Options(object):
delete_unused_variables ``True``
convert_scalars_to_arrays ``True``
convert_numpy_bytes_to_utf16 ``True``
+ convert_numpy_str_to_utf16 ``True``
convert_bools_to_uint8 ``True``
reverse_dimension_order ``True``
store_shape_for_empty ``True``
@@ -226,6 +227,8 @@ class Options(object):
See Attributes.
convert_numpy_bytes_to_utf16 : bool, optional
See Attributes.
+ convert_numpy_str_to_utf16 : bool, optional
+ See Attributes.
convert_bools_to_uint8 : bool, optional
See Attributes.
reverse_dimension_order : bool, optional
@@ -246,6 +249,7 @@ class Options(object):
delete_unused_variables : bool
convert_scalars_to_arrays : bool
convert_numpy_bytes_to_utf16 : bool
+ convert_numpy_str_to_utf16 : bool
convert_bools_to_uint8 : bool
reverse_dimension_order : bool
store_shape_for_empty : bool
@@ -264,6 +268,7 @@ class Options(object):
delete_unused_variables=False,
convert_scalars_to_arrays=False,
convert_numpy_bytes_to_utf16=False,
+ convert_numpy_str_to_utf16=False,
convert_bools_to_uint8=False,
reverse_dimension_order=False,
store_shape_for_empty=False,
@@ -276,6 +281,7 @@ class Options(object):
self._delete_unused_variables = False
self._convert_scalars_to_arrays = False
self._convert_numpy_bytes_to_utf16 = False
+ self._convert_numpy_str_to_utf16 = False
self._convert_bools_to_uint8 = False
self._reverse_dimension_order = False
self._store_shape_for_empty = False
@@ -291,6 +297,7 @@ class Options(object):
self.delete_unused_variables = delete_unused_variables
self.convert_scalars_to_arrays = convert_scalars_to_arrays
self.convert_numpy_bytes_to_utf16 = convert_numpy_bytes_to_utf16
+ self.convert_numpy_str_to_utf16 = convert_numpy_str_to_utf16
self.convert_bools_to_uint8 = convert_bools_to_uint8
self.reverse_dimension_order = reverse_dimension_order
self.store_shape_for_empty = store_shape_for_empty
@@ -355,6 +362,7 @@ class Options(object):
delete_unused_variables ``True``
convert_scalars_to_arrays ``True``
convert_numpy_bytes_to_utf16 ``True``
+ convert_numpy_str_to_utf16 ``True``
convert_bools_to_uint8 ``True``
reverse_dimension_order ``True``
store_shape_for_empty ``True``
@@ -379,6 +387,7 @@ class Options(object):
self._delete_unused_variables = True
self._convert_scalars_to_arrays = True
self._convert_numpy_bytes_to_utf16 = True
+ self._convert_numpy_str_to_utf16 = True
self._convert_bools_to_uint8 = True
self._reverse_dimension_order = True
self._store_shape_for_empty = True
@@ -442,8 +451,8 @@ class Options(object):
If ``True`` (defaults to ``False`` unless MATLAB compatibility
is being done), ``numpy.bytes_`` and anything that is converted
- to them (``str``, ``bytes``, and ``bytearray``) are converted to
- UTF-16 before being written to file as ``numpy.uint16``.
+ to them (``bytes`` and ``bytearray``) are converted to UTF-16
+ before being written to file as ``numpy.uint16``.
Must be ``True`` if doing MATLAB compatibility. MATLAB uses
UTF-16 for its strings.
@@ -451,6 +460,7 @@ class Options(object):
See Also
--------
numpy.bytes_
+ convert_numpy_str_to_utf16
"""
return self._convert_numpy_bytes_to_utf16
@@ -465,6 +475,42 @@ class Options(object):
self._matlab_compatible = False
@property
+ def convert_numpy_str_to_utf16(self):
+ """ Whether or not to convert numpy.str_ to UTF-16.
+
+ bool
+
+ If ``True`` (defaults to ``False`` unless MATLAB compatibility
+ is being done), ``numpy.str_`` and anything that is converted
+ to them (``str``) will be converted to UTF-16 if possible before
+ being written to file as ``numpy.uint16``. If doing so would
+ lead to a loss of data (character can't be translated to
+ UTF-16) or would change the shape of an array of ``numpy.str_``
+ due to a character being converted into a pair of 2-byte units, the
+ conversion will not be made and the string will be stored in
+ UTF-32 form as a ``numpy.uint32``.
+
+ Must be ``True`` if doing MATLAB compatibility. MATLAB uses
+ UTF-16 for its strings.
+
+ See Also
+ --------
+ numpy.str_
+ convert_numpy_bytes_to_utf16
+
+ """
+ return self._convert_numpy_str_to_utf16
+
+ @convert_numpy_str_to_utf16.setter
+ def convert_numpy_str_to_utf16(self, value):
+ # Check that it is a bool, and then set it. If it is false, we
+ # are not doing MATLAB compatible formatting.
+ if isinstance(value, bool):
+ self._convert_numpy_str_to_utf16 = value
+ if not self._convert_numpy_str_to_utf16:
+ self._matlab_compatible = False
+
+ @property
def convert_bools_to_uint8(self):
""" Whether or not to convert bools to ``numpy.uint8``.
diff --git a/hdf5storage/utilities.py b/hdf5storage/utilities.py
index 3ddb125..d9d635e 100644
--- a/hdf5storage/utilities.py
+++ b/hdf5storage/utilities.py
@@ -65,6 +65,123 @@ def next_unused_name_in_group(grp, length):
if name not in existing_names:
return name
+def convert_numpy_str_to_uint16(data):
+ """ Converts a numpy.str_ to UTF-16 encoding in numpy.uint16 form.
+
+ Convert a ``numpy.str_`` or an array of them (they are UTF-32
+ strings) to UTF-16 in the equivalent array of ``numpy.uint16``. The
+ conversion will throw an exception if any characters cannot be
+ converted to UTF-16. Strings are expanded along rows (across columns)
+ so a 2x3x4 array of 10 element strings will get turned into a 2x30x4
+ array of uint16's if every UTF-32 character converts easily to a
+ UTF-16 singlet, as opposed to a UTF-16 doublet.
+
+ Parameters
+ ----------
+ data : numpy.str_ or numpy.ndarray of numpy.str_
+ The string or array of them to convert.
+
+ Returns
+ -------
+ numpy.ndarray of numpy.uint16
+ The result of the conversion.
+
+ Raises
+ ------
+ UnicodeEncodeError
+ If a UTF-32 character has no UTF-16 representation.
+
+ See Also
+ --------
+ convert_numpy_str_to_uint32
+ decode_to_numpy_unicode
+
+ """
+ # An empty string should be an empty uint16
+ if data.nbytes == 0:
+ return np.uint16([])
+
+ # If it is just a string instead of an array of them, then the
+ # string can simply be converted and returned as a 1d array pretty
+ # easily using ndarray's buffer option. The byte order mark, 2
+ # bytes, needs to be removed.
+ if not isinstance(data, np.ndarray):
+ s = data.encode(encoding='UTF-16', errors='strict')
+ return np.ndarray(shape=((len(s)-2)//2,), dtype='uint16',
+ buffer=s[2:])
+
+ # It is an array of strings. Each string in the array needs to be
+ # converted. An object array is needed to hold all the converted
+ # forms, as opposed to just constructing the final uint16 array,
+ # because the converted forms could end up with greatly differing lengths
+ # depending on how many characters turn into doublets. The sizes of
+ # each one need to be grabbed along the way to be able to construct
+ # the final array. The easiest way to convert each string is to use
+ # recursion.
+ converted_strings = np.ndarray(shape=data.shape, dtype='object')
+ sizes = np.zeros(shape=data.shape, dtype='int64')
+
+ for index, x in np.ndenumerate(data):
+ converted_strings[index] = convert_numpy_str_to_uint16(x)
+ sizes[index] = np.prod(converted_strings[index].shape)
+
+ # The shape of the new array is simply the shape of the old one with
+ # the number of columns increased multiplicatively by the size of
+ # the largest UTF-16 string so that everything will fit.
+ length = np.max(sizes)
+ shape = list(data.shape)
+ shape[-1] *= length
+ new_data = np.zeros(shape=tuple(shape), dtype='uint16')
+
+ # Copy each string into new_data using clever indexing (using the
+ # first part of index returns a 1d subarray that can be
+ # addressed). Then, the conversion is done.
+ for index, x in np.ndenumerate(converted_strings):
+ new_data[index[:-1]][ \
+ (length*index[-1]):(length*index[-1]+sizes[index])] = x
+
+ return new_data
+
+def convert_numpy_str_to_uint32(data):
+ """ Converts a numpy.str_ to its numpy.uint32 representation.
+
+ Convert a ``numpy.str_`` or an array of them (they are UTF-32
+ strings) into the equivalent array of ``numpy.uint32`` that is byte
+ for byte identical. Strings are expanded along rows (across columns)
+ so a 2x3x4 array of 10 element strings will get turned into a 2x30x4
+ array of uint32's.
+
+ Parameters
+ ----------
+ data : numpy.str_ or numpy.ndarray of numpy.str_
+ The string or array of them to convert.
+
+ Returns
+ -------
+ numpy.ndarray of numpy.uint32
+ The result of the conversion.
+
+ See Also
+ --------
+ convert_numpy_str_to_uint16
+ decode_to_numpy_unicode
+
+ """
+ if data.nbytes == 0:
+ # An empty string should be an empty uint32.
+ return np.uint32([])
+ else:
+ # We need to calculate the new shape from the current shape,
+ # which will have to be expanded along the rows to fit all the
+ # characters (the dtype.itemsize gets the number of bytes in
+ # each string, which is just 4 times the number of
+ # characters). Then it is a matter of getting a view of the
+ # string (in flattened form so that it is contiguous) as uint32
+ # and then reshaping it.
+ shape = list(np.atleast_1d(data).shape)
+ shape[-1] *= data.dtype.itemsize//4
+ return data.flatten().view(np.uint32).reshape(tuple(shape))
+
def decode_to_str(data):
""" Decodes data to the Python str type.
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-hdf5storage.git
More information about the debian-science-commits
mailing list