[python-hdf5storage] 69/84: Added compression to Options and the Numpy marshaller. (cherry picked from commit b63e610bea6cc66eeaa350ca1a9f127f07cb6631)
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Mon Feb 29 08:25:05 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to annotated tag 0.1.10
in repository python-hdf5storage.
commit 509ac3716560190c76570549ceff52092fe7d010
Author: Freja Nordsiek <fnordsie at gmail.com>
Date: Mon Aug 17 00:44:52 2015 -0400
Added compression to Options and the Numpy marshaller.
(cherry picked from commit b63e610bea6cc66eeaa350ca1a9f127f07cb6631)
Conflicts:
hdf5storage/Marshallers.py
hdf5storage/__init__.py
---
hdf5storage/Marshallers.py | 60 +++++++++--
hdf5storage/__init__.py | 247 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 298 insertions(+), 9 deletions(-)
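
For orientation, here is a minimal sketch of how the new compression
options are meant to be used once this commit lands. The
hdf5storage.write call and its signature are assumed from the
surrounding 0.1.x API rather than shown in this diff:

    import numpy as np
    import hdf5storage

    # Defaults added by this commit (see the Options diff below): gzip
    # at level 7 with the shuffle and fletcher32 filters, applied to
    # any object of 16 KiB or more.
    options = hdf5storage.Options(compress=True,
                                  compress_size_threshold=16*1024,
                                  compression_algorithm='gzip',
                                  gzip_compression_level=7,
                                  shuffle_filter=True,
                                  compressed_fletcher32_filter=True,
                                  uncompressed_fletcher32_filter=False)

    # 100000 float64 values = 800 kB, well over the threshold, so the
    # dataset is written gzip-compressed.
    data = np.arange(100000.0)
    hdf5storage.write(data, path='/data', filename='data.h5',
                      options=options)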
diff --git a/hdf5storage/Marshallers.py b/hdf5storage/Marshallers.py
index 70b9504..b64ec14 100644
--- a/hdf5storage/Marshallers.py
+++ b/hdf5storage/Marshallers.py
@@ -784,23 +784,65 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
+ 'in the field names of this type of ' \
+ 'numpy.ndarray are not supported.')
+ # Set the storage options such as compression, chunking,
+ # filters, etc. If the data is being compressed (compression
+ # is enabled and the data is bigger than the threshold),
+ # turn on compression, set the algorithm, set the
+ # compression level, and enable the shuffle and fletcher32
+ # filters appropriately. If the data is not being
+ # compressed, turn on the fletcher32 filter if
+ # indicated. Compression should not be done for scalars.
+ filters = dict()
+        is_scalar = (data_to_store.shape == tuple())
+        if not is_scalar and options.compress \
+                and data_to_store.nbytes \
+                >= options.compress_size_threshold:
+ filters['compression'] = \
+ options.compression_algorithm
+ if filters['compression'] == 'gzip':
+ filters['compression_opts'] = \
+ options.gzip_compression_level
+ filters['shuffle'] = options.shuffle_filter
+ filters['fletcher32'] = \
+ options.compressed_fletcher32_filter
+ else:
+ filters['compression'] = None
+ filters['shuffle'] = False
+ filters['compression_opts'] = None
+            if not is_scalar:
+ filters['fletcher32'] = \
+ options.uncompressed_fletcher32_filter
+ else:
+ filters['fletcher32'] = False
+
+        # Set the chunking to auto if the dataset needs to be chunked
+        # (h5py requires chunked storage whenever compression or the
+        # fletcher32 filter is used).
+ if filters['compression'] is not None \
+ or filters['fletcher32']:
+ filters['chunks'] = True
+ else:
+ filters['chunks'] = None
+
# The data must first be written. If name is not present
# yet, then it must be created. If it is present, but not a
- # Dataset, has the wrong dtype, or is the wrong shape; then
- # it must be deleted and then written. Otherwise, it is just
- # overwritten in place (note, this will not change any
- # filters or chunking settings, but will keep the file from
- # growing needlessly).
-
+ # Dataset, has the wrong dtype, is the wrong shape, doesn't
+ # use the same compression, or doesn't use the same filters;
+ # then it must be deleted and then written. Otherwise, it is
+ # just overwritten in place.
if name not in grp:
grp.create_dataset(name, data=data_to_store,
- **options.array_options)
+ **filters)
elif not isinstance(grp[name], h5py.Dataset) \
or grp[name].dtype != data_to_store.dtype \
- or grp[name].shape != data_to_store.shape:
+ or grp[name].shape != data_to_store.shape \
+ or grp[name].compression != filters['compression'] \
+ or grp[name].shuffle != filters['shuffle'] \
+ or grp[name].fletcher32 != filters['fletcher32'] \
+ or grp[name].compression_opts != \
+ filters['compression_opts']:
del grp[name]
grp.create_dataset(name, data=data_to_store,
- **options.array_options)
+ **filters)
else:
grp[name][...] = data_to_store
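
To make the mapping concrete: the filters dict assembled above is
passed straight through to h5py as keyword arguments. For a
large-enough array under the default options, the create_dataset call
in this hunk effectively expands to the following (a sketch using only
standard h5py keywords):

    import h5py
    import numpy as np

    with h5py.File('example.h5', 'w') as f:
        f.create_dataset('data', data=np.arange(100000.0),
                         compression='gzip',  # compression_algorithm
                         compression_opts=7,  # gzip_compression_level
                         shuffle=True,        # shuffle_filter
                         fletcher32=True,     # compressed_fletcher32_filter
                         chunks=True)         # auto-chunking, required
                                              # by the filters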
diff --git a/hdf5storage/__init__.py b/hdf5storage/__init__.py
index 02ec846..cd0fca0 100644
--- a/hdf5storage/__init__.py
+++ b/hdf5storage/__init__.py
@@ -72,6 +72,7 @@ class Options(object):
store_shape_for_empty ``True``
complex_names ``('real', 'imag')``
group_for_references ``'/#refs#'``
+ compression_algorithm ``'gzip'``
================================== ====================
In addition to setting these options, a specially formatted block of
@@ -109,6 +110,20 @@ class Options(object):
See Attributes.
oned_as : str, optional
See Attributes.
+ compress : bool, optional
+ See Attributes.
+ compress_size_threshold : int, optional
+ See Attributes.
+ compression_algorithm : str, optional
+ See Attributes.
+ gzip_compression_level : int, optional
+ See Attributes.
+ shuffle_filter : bool, optional
+ See Attributes.
+ compressed_fletcher32_filter : bool, optional
+ See Attributes.
+ uncompressed_fletcher32_filter : bool, optional
+ See Attributes.
marshaller_collection : MarshallerCollection, optional
See Attributes.
**keywords :
@@ -132,6 +147,13 @@ class Options(object):
complex_names : tuple of two str
group_for_references : str
oned_as : {'row', 'column'}
+ compress : bool
+ compress_size_threshold : int
+ compression_algorithm : {'gzip', 'lzf', 'szip'}
+ gzip_compression_level : int
+ shuffle_filter : bool
+ compressed_fletcher32_filter : bool
+ uncompressed_fletcher32_filter : bool
scalar_options : dict
``h5py.Group.create_dataset`` options for writing scalars.
array_options : dict
@@ -154,8 +176,19 @@ class Options(object):
complex_names=('r', 'i'),
group_for_references="/#refs#",
oned_as='row',
+                 compress=True,
+                 compress_size_threshold=16*1024,
+                 compression_algorithm='gzip',
+                 gzip_compression_level=7,
+                 shuffle_filter=True,
+                 compressed_fletcher32_filter=True,
+                 uncompressed_fletcher32_filter=False,
                  marshaller_collection=None,
                  **keywords):
# Set the defaults.
self._store_python_metadata = True
@@ -171,6 +204,13 @@ class Options(object):
self._complex_names = ('r', 'i')
self._group_for_references = "/#refs#"
self._oned_as = 'row'
+ self._compress = True
+ self._compress_size_threshold = 16*1024
+ self._compression_algorithm = 'gzip'
+ self._gzip_compression_level = 7
+ self._shuffle_filter = True
+ self._compressed_fletcher32_filter = True
+ self._uncompressed_fletcher32_filter = False
self._matlab_compatible = True
# Apply all the given options using the setters, making sure to
@@ -192,6 +232,14 @@ class Options(object):
self.complex_names = complex_names
self.group_for_references = group_for_references
self.oned_as = oned_as
+ self.compress = compress
+ self.compress_size_threshold = compress_size_threshold
+ self.compression_algorithm = compression_algorithm
+ self.gzip_compression_level = gzip_compression_level
+ self.shuffle_filter = shuffle_filter
+ self.compressed_fletcher32_filter = compressed_fletcher32_filter
+ self.uncompressed_fletcher32_filter = \
+ uncompressed_fletcher32_filter
self.matlab_compatible = matlab_compatible
# Set the h5py options to use for writing scalars and arrays to
@@ -258,6 +306,7 @@ class Options(object):
store_shape_for_empty ``True``
complex_names ``('real', 'imag')``
group_for_references ``'/#refs#'``
+ compression_algorithm ``'gzip'``
================================== ====================
In addition to setting these options, a specially formatted
@@ -284,6 +333,7 @@ class Options(object):
self._store_shape_for_empty = True
self._complex_names = ('real', 'imag')
self._group_for_references = "/#refs#"
+ self._compression_algorithm = 'gzip'
@property
def action_for_matlab_incompatible(self):
@@ -621,6 +671,203 @@ class Options(object):
if value in ('row', 'column'):
self._oned_as = value
+ @property
+ def compress(self):
+ """ Whether to compress large python objects (datasets).
+
+ bool
+
+ If ``True``, python objects (datasets) larger than
+ ``compress_size_threshold`` will be compressed.
+
+ See Also
+ --------
+ compress_size_threshold
+ compression_algorithm
+ shuffle_filter
+ compressed_fletcher32_filter
+
+ """
+ return self._compress
+
+ @compress.setter
+ def compress(self, value):
+ # Check that it is a bool, and then set it.
+ if isinstance(value, bool):
+ self._compress = value
+
+ @property
+ def compress_size_threshold(self):
+ """ Minimum size of a python object before it is compressed.
+
+ int
+
+ Minimum size in bytes a python object must be for it to be
+ compressed if ``compress`` is set. Must be non-negative.
+
+ See Also
+ --------
+ compress
+
+ """
+ return self._compress_size_threshold
+
+ @compress_size_threshold.setter
+ def compress_size_threshold(self, value):
+ # Check that it is a non-negative integer, and then set it.
+ if isinstance(value, int) and value >= 0:
+ self._compress_size_threshold = value
+
+ @property
+ def compression_algorithm(self):
+ """ Algorithm to use for compression.
+
+ {'gzip', 'lzf', 'szip'}
+
+        Compression algorithm to use when the ``compress`` option is set
+ and a python object is larger than ``compress_size_threshold``.
+ ``'gzip'`` is the only MATLAB compatible option.
+
+ ``'gzip'`` is also known as the Deflate algorithm, which is the
+ default compression algorithm of ZIP files and is a common
+ compression algorithm used on tarballs. It is the most
+ compatible option. It has good compression and is reasonably
+ fast. Its compression level is set with the
+ ``gzip_compression_level`` option, which is an integer between 0
+ and 9 inclusive.
+
+ ``'lzf'`` is a very fast but low to moderate compression
+ algorithm. It is less commonly used than gzip/Deflate, but
+ doesn't have any patent or license issues.
+
+ ``'szip'`` is a compression algorithm that has some patents and
+ license restrictions. It is not always available.
+
+ See Also
+ --------
+ compress
+ compress_size_threshold
+ h5py.Group.create_dataset
+ http://www.hdfgroup.org/doc_resource/SZIP/Commercial_szip.html
+
+ """
+ return self._compression_algorithm
+
+ @compression_algorithm.setter
+ def compression_algorithm(self, value):
+ # Check that it is one of the valid values before setting it. If
+ # it is something other than 'gzip', then we are not doing
+ # MATLAB compatible formatting.
+ if value in ('gzip', 'lzf', 'szip'):
+ self._compression_algorithm = value
+ if self._compression_algorithm != 'gzip':
+ self._matlab_compatible = False
+
+ @property
+ def gzip_compression_level(self):
+ """ The compression level to use when doing the gzip algorithm.
+
+ int
+
+ Compression level to use when data is being compressed with the
+ ``'gzip'`` algorithm. Must be an integer between 0 and 9
+ inclusive. Lower values are faster while higher values give
+ better compression.
+
+ See Also
+ --------
+ compress
+ compression_algorithm
+
+ """
+ return self._gzip_compression_level
+
+ @gzip_compression_level.setter
+ def gzip_compression_level(self, value):
+ # Check that it is an integer between 0 and 9.
+ if isinstance(value, int) and value >= 0 and value <= 9:
+ self._gzip_compression_level = value
+
+ @property
+ def shuffle_filter(self):
+ """ Whether to use the shuffle filter on compressed python objects.
+
+ bool
+
+ If ``True``, python objects (datasets) that are compressed are
+ run through the shuffle filter, which reversibly rearranges the
+ data to improve compression.
+
+ See Also
+ --------
+ compress
+ h5py.Group.create_dataset
+
+ """
+ return self._shuffle_filter
+
+ @shuffle_filter.setter
+ def shuffle_filter(self, value):
+ # Check that it is a bool, and then set it.
+ if isinstance(value, bool):
+ self._shuffle_filter = value
+
+ @property
+ def compressed_fletcher32_filter(self):
+ """ Whether to use the fletcher32 filter on compressed python objects.
+
+ bool
+
+ If ``True``, python objects (datasets) that are compressed are
+ run through the fletcher32 filter, which stores a checksum with
+ each chunk so that data corruption can be more easily detected.
+
+ See Also
+ --------
+ compress
+ shuffle_filter
+        uncompressed_fletcher32_filter
+ h5py.Group.create_dataset
+
+ """
+ return self._compressed_fletcher32_filter
+
+ @compressed_fletcher32_filter.setter
+ def compressed_fletcher32_filter(self, value):
+ # Check that it is a bool, and then set it.
+ if isinstance(value, bool):
+ self._compressed_fletcher32_filter = value
+
+ @property
+ def uncompressed_fletcher32_filter(self):
+ """ Whether to use the fletcher32 filter on uncompressed non-scalar python objects.
+
+ bool
+
+ If ``True``, python objects (datasets) that are **NOT**
+ compressed and are not scalars (when converted to a Numpy type,
+ their shape is not an empty ``tuple``) are run through the
+ fletcher32 filter, which stores a checksum with each chunk so
+ that data corruption can be more easily detected. This forces
+        all uncompressed data to be chunked regardless of how small it
+        is, which can increase file sizes.
+
+ See Also
+ --------
+ compress
+ shuffle_filter
+        compressed_fletcher32_filter
+ h5py.Group.create_dataset
+
+ """
+ return self._uncompressed_fletcher32_filter
+
+ @uncompressed_fletcher32_filter.setter
+ def uncompressed_fletcher32_filter(self, value):
+ # Check that it is a bool, and then set it.
+ if isinstance(value, bool):
+ self._uncompressed_fletcher32_filter = value
+
class MarshallerCollection(object):
""" Represents, maintains, and retreives a set of marshallers.
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-hdf5storage.git