[h5py] 179/455: Start redoing HL compression interface

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jul 2 18:19:30 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to annotated tag 1.3.0
in repository h5py.

commit bffedbb29b3ae4d76db4136e244604f2f4b2f387
Author: andrewcollette <andrew.collette at gmail.com>
Date:   Wed Dec 10 03:49:18 2008 +0000

    Start redoing HL compression interface
---
 h5py/highlevel.py            |  83 +++++++++++++++---------------
 h5py/tests/test_highlevel.py |   5 ++
 h5py/utils_hl.py             | 120 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 165 insertions(+), 43 deletions(-)
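
The net effect of the patch: "compression" changes from a raw DEFLATE
level to a named strategy ("gzip", "lzf", or "szip"), with the numeric
setting split out into a new "compression_opts" keyword.  A minimal
usage sketch, assuming the usual Group.create_dataset wrapper forwards
these keywords to the Dataset constructor (keyword spellings follow the
patch below):

    # Old style: a bare integer meant a gzip (DEFLATE) level
    dset = grp.create_dataset("x", shape=(100,), dtype='f', compression=6)

    # New style: named strategy, with options passed separately
    dset = grp.create_dataset("x", shape=(100,), dtype='f',
                              compression='gzip', compression_opts=6)

    # Bare integers are still accepted and remapped to ('gzip', level)
    # by the legacy shim in Dataset.__init__.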

diff --git a/h5py/highlevel.py b/h5py/highlevel.py
index 354274f..bece59d 100644
--- a/h5py/highlevel.py
+++ b/h5py/highlevel.py
@@ -53,6 +53,7 @@ import warnings
 
 from h5py import h5, h5f, h5g, h5s, h5t, h5d, h5a, h5p, h5z, h5i
 from h5py.h5 import H5Error
+import utils_hl as uhl
 from utils_hl import slice_select, hbasename, guess_chunk
 from utils_hl import CoordsList
 from browse import _H5Browser
@@ -614,31 +615,30 @@ class Dataset(HLObject):
     @property
     def chunks(self):
         """Dataset chunks (or None)"""
-        try:
-            return self._plist.get_chunk()
-        except H5Error:
-            return None
+        return self._chunks
 
     @property
     def compression(self):
-        """Compression level (or None)"""
-        filt = self._plist.get_filter_by_id(h5z.FILTER_DEFLATE)
-        if filt is not None:
-            return filt[1][0]
-        filt = self._plist.get_filter_by_id(h5z.FILTER_LZF)
-        if filt is not None:
-            return 'lzf'
+        """Compression strategy (or None)"""
+        for x in ('gzip','lzf','szip'):
+            if x in self._filters:
+                return x
         return None
 
     @property
+    def compression_opts(self):
+        """ Compression setting.  Int(0-9) for gzip, 2-tuple for szip. """
+        return self._filters.get(self.compression, None)
+
+    @property
     def shuffle(self):
         """Shuffle filter present (T/F)"""
-        return self._plist.get_filter_by_id(h5z.FILTER_SHUFFLE) is not None
+        return 'shuffle' in self._filters
 
     @property
     def fletcher32(self):
         """Fletcher32 filter is present (T/F)"""
-        return self._plist.get_filter_by_id(h5z.FILTER_FLETCHER32) is not None
+        return 'fletcher32' in self._filters
         
     @property
     def maxshape(self):
@@ -649,7 +649,7 @@ class Dataset(HLObject):
     def __init__(self, group, name,
                     shape=None, dtype=None, data=None,
                     chunks=None, compression=None, shuffle=False,
-                    fletcher32=False, maxshape=None):
+                    fletcher32=False, maxshape=None, compression_opts=None):
         """ Open or create a new dataset in the file.
 
         It's recommended you use the Group methods (open via Group["name"],
@@ -675,12 +675,14 @@ class Dataset(HLObject):
         Creation keywords (* is default):
 
         chunks:        Tuple of chunk dimensions, True, or None*
-        compression:   DEFLATE (gzip) compression level, int or None*
+        compression:   "gzip", "lzf", or "szip" (if available)
         shuffle:       Use the shuffle filter? (requires compression) T/F*
         fletcher32:    Enable Fletcher32 error detection? T/F*
         maxshape:      Tuple giving dataset maximum dimensions or None*.
                        You can grow each axis up to this limit using
                        resize().  For each unlimited axis, provide None.
+
+        compression_opts: Optional setting for the compression filter
 
         All these options require chunking.  If a chunk tuple is not
         provided, the constructor will guess an appropriate chunk shape.
@@ -688,7 +690,7 @@ class Dataset(HLObject):
         """
         with group._lock:
             if data is None and shape is None:
-                if any((data,dtype,shape,chunks,compression,shuffle,fletcher32)):
+                if any((dtype, chunks, compression, compression_opts, shuffle, fletcher32, maxshape)):
                     raise ValueError('You cannot specify keywords when opening a dataset.')
                 self.id = h5d.open(group.id, name)
             else:
@@ -702,7 +704,6 @@ class Dataset(HLObject):
                     if data is None:
                         raise TypeError("Either data or shape must be specified")
                     shape = data.shape
-
                 else:
                     shape = tuple(shape)
                     if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
@@ -716,33 +717,26 @@ class Dataset(HLObject):
                 else:
                     dtype = numpy.dtype(dtype)
 
-                # Generate chunks if necessary
-                if any((compression, shuffle, fletcher32, maxshape)) or chunks is True:
+                # These storage options require a chunked layout
+                if any((compression, shuffle, fletcher32, maxshape)):
                     if chunks is False:
                         raise ValueError("Chunked format required for given storage options")
-                    if chunks in (True, None):
-                        chunks = guess_chunk(shape, dtype.itemsize)
-
-                if chunks and shape == ():
-                    raise ValueError("Filter options cannot be used with scalar datasets.")
-
-                plist = h5p.create(h5p.DATASET_CREATE)
-                if chunks:
-                    plist.set_chunk(tuple(chunks))
-                    plist.set_fill_time(h5d.FILL_TIME_ALLOC)
-                if shuffle:
-                    plist.set_shuffle()
-                if compression:
-                    if compression is True:
-                        compression = 6
-                    if compression in range(10):
-                        plist.set_deflate(compression)
-                    elif compression == 'lzf':
-                        plist.set_filter(h5z.FILTER_LZF, h5z.FLAG_OPTIONAL)
+
+                # Legacy
+                if compression in range(10) or compression is True:
+                    if compression_opts is None:
+                        if compression is True:
+                            compression_opts = 4
+                        else:
+                            compression_opts = compression
                     else:
-                        raise ValueError('Compression must be 0-9 or "lzf"')
-                if fletcher32:
-                    plist.set_fletcher32()
+                        raise TypeError("Conflict in compression options")
+                    compression = 'gzip'
+
+                # Generate the dataset creation property list
+                # This also validates the keyword arguments
+                plist = uhl.generate_dcpl(shape, dtype, chunks, compression,
+                            compression_opts, shuffle, fletcher32, maxshape)
 
                 if maxshape is not None:
                     maxshape = tuple(x if x is not None else h5s.UNLIMITED for x in maxshape)
@@ -755,7 +749,12 @@ class Dataset(HLObject):
                     self.id.write(h5s.ALL, h5s.ALL, data)
 
             self._attrs = AttributeManager(self)
-            self._plist = self.id.get_create_plist()
+            plist = self.id.get_create_plist()
+            self._filters = uhl.get_filters(plist)
+            if plist.get_layout() == h5d.CHUNKED:
+                self._chunks = plist.get_chunk()
+            else:
+                self._chunks = None
 
     def extend(self, shape):
         """ Deprecated.  Use resize() instead. """
diff --git a/h5py/tests/test_highlevel.py b/h5py/tests/test_highlevel.py
index 986543c..78c0262 100644
--- a/h5py/tests/test_highlevel.py
+++ b/h5py/tests/test_highlevel.py
@@ -236,6 +236,11 @@ class TestDataset(HDF5TestCase):
                 elif value is True:
                     self.assert_(getattr(hdf, name) is not None,
                       "True kwd ignored: %s" % name)
+                elif name == 'compression':
+                    cname = getattr(hdf, name)
+                    cval = getattr(hdf, 'compression_opts')
+                    self.assertEqual(cname, 'gzip')
+                    self.assertEqual(cval, value)
                 else:
                     self.assertEqual(getattr(hdf, name), value,
                       "kwd mismatch: %s: %s %s" % (name, getattr(hdf, name), value))
diff --git a/h5py/utils_hl.py b/h5py/utils_hl.py
index a2ffe46..c5cfe23 100644
--- a/h5py/utils_hl.py
+++ b/h5py/utils_hl.py
@@ -3,7 +3,7 @@
     Utility functions for high-level modules.
 """
 from __future__ import with_statement
-from h5py import h5s
+from h5py import h5s, h5z, h5p, h5d
 
 from posixpath import basename, normpath
 import numpy
@@ -20,6 +20,124 @@ def hbasename(name):
         bname = '/'
     return bname
 
+COMP_FILTERS = {'gzip': h5z.FILTER_DEFLATE,
+                'szip': h5z.FILTER_SZIP,
+                'lzf': h5z.FILTER_LZF }
+
+def generate_dcpl(shape, dtype, chunks, compression, compression_opts,
+                  shuffle, fletcher32, maxshape):
+    """ Generate a dataset creation property list.
+
+        Checks range and correctness of each argument.  Does not check
+        for disallowed arguments.
+
+        chunks:         None or tuple with len == len(shape)
+        compression:    None or in 'gzip', 'lzf', 'szip'
+        compression_opts: None or <arbitrary>
+        shuffle:        T/F
+        fletcher32:     T/F
+        maxshape:       None or tuple with len == len(shape)
+    """
+
+    # Validate and normalize arguments
+
+    shuffle = bool(shuffle)
+    fletcher32 = bool(fletcher32)
+
+    def rq_tuple(tpl, name):
+        """ Check rank of chunks/maxshape against the dataset shape """
+        if tpl in (None, True):
+            return
+        try:
+            tpl = tuple(tpl)
+        except TypeError:
+            raise TypeError('"%s" argument must be None or a sequence object' % name)
+        if len(tpl) != len(shape):
+            raise ValueError('"%s" must have same rank as dataset shape' % name)
+
+    rq_tuple(chunks, 'chunks')
+    rq_tuple(maxshape, 'maxshape')
+
+    if compression is not None:
+        if compression not in COMP_FILTERS:
+            raise ValueError("Compression method must be one of %s" % ", ".join(COMP_FILTERS))
+        if compression == 'gzip':
+            if compression_opts is None:
+                gzip_level = 4
+            elif compression_opts in range(10):
+                gzip_level = compression_opts
+            else:
+                raise ValueError("GZIP setting must be an integer from 0-9, not %r" % compression_opts)
+        elif compression == 'lzf':
+            if compression_opts is not None:
+                raise ValueError("LZF compression filter accepts no options")
+        elif compression == 'szip':
+            if compression_opts is None:
+                compression_opts = ('nn', 8)
+            err = "SZIP options must be a 2-tuple ('ec'|'nn', even integer 0-32)"
+            try:
+                szmethod, szpix = compression_opts
+            except (TypeError, ValueError):
+                raise TypeError(err)
+            if szmethod not in ('ec', 'nn'):
+                raise ValueError(err)
+            if not (0 < szpix <= 32 and szpix % 2 == 0):
+                raise ValueError(err)
+
+    # End argument validation
+
+    if (chunks is True) or \
+       (chunks is None and any((shuffle, fletcher32, compression, maxshape))):
+        if shape == ():
+            raise TypeError("Compression cannot be used with scalar datasets")
+        chunks = guess_chunk(shape, dtype.itemsize)
+        
+    if maxshape is True:
+        maxshape = (None,)*len(shape)
+
+    plist = h5p.create(h5p.DATASET_CREATE)
+    if chunks is not None:
+        plist.set_chunk(chunks)
+        plist.set_fill_time(h5d.FILL_TIME_ALLOC)
+
+    if shuffle:
+        plist.set_shuffle()
+
+    if compression == 'gzip':
+        plist.set_deflate(gzip_level)
+    elif compression == 'lzf':
+        plist.set_filter(h5z.FILTER_LZF, h5z.FLAG_OPTIONAL)
+    elif compression == 'szip':
+        opts = {'ec': h5z.SZIP_EC_OPTION_MASK, 'nn': h5z.SZIP_NN_OPTION_MASK}
+        plist.set_szip(opts[szmethod], szpix)
+
+    if fletcher32:
+        plist.set_fletcher32()
+
+    return plist
+
+def get_filters(plist):
+    """ Extract a dictionary of active filters from a DCPL, along with
+    their settings
+    """
+
+    filters = {h5z.FILTER_DEFLATE: 'gzip', h5z.FILTER_SZIP: 'szip',
+               h5z.FILTER_SHUFFLE: 'shuffle', h5z.FILTER_FLETCHER32: 'fletcher32',
+               h5z.FILTER_LZF: 'lzf'}
+
+    pipeline = {}
+
+    nfilters = plist.get_nfilters()
+
+    for i in range(nfilters):
+        code, flags, vals, desc = plist.get_filter(i)
+        if len(vals) == 0:
+            vals = None
+        elif len(vals) == 1:
+            vals = vals[0]
+        pipeline[filters.get(code, str(code))] = vals
+
+    return pipeline
+
 def guess_chunk(shape, typesize):
     """ Guess an appropriate chunk layout for a dataset, given its shape and
         the size of each element in bytes.  Will allocate chunks only as large

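A sketch of the read-back side, using a hypothetical file name; the
property names and the get_filters() behaviour follow the patch, though
details of this vintage of the API may differ:

    import h5py

    f = h5py.File('demo.hdf5', 'w')

    # Chunked, gzip-compressed dataset with the shuffle filter enabled.
    # SZIP would instead take compression='szip' with a 2-tuple such as
    # compression_opts=('nn', 16).
    dset = f.create_dataset('data', shape=(1000,), dtype='f',
                            compression='gzip', compression_opts=6,
                            shuffle=True)

    # The properties are populated once in Dataset.__init__ from the
    # dataset creation property list via utils_hl.get_filters(), rather
    # than queried filter-by-filter on every access.
    print dset.compression       # 'gzip'
    print dset.compression_opts  # 6
    print dset.shuffle           # True
    print dset.chunks            # e.g. (1000,), chosen by guess_chunk

    f.close()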
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/h5py.git


