[PATCH 2/2] deb822: Better support for non-ascii values

John Wright john.wright at hp.com
Fri Jan 29 01:41:03 UTC 2010


This patch changes Deb822Dict to give back only unicode values.  That
class (and the Deb822 subclass) now takes an encoding argument, which
defaults to "utf-8", and specifies how incoming strings are to be
interpreted.  Likewise, the dump method takes an optional encoding
argument to specify how to encode the unicode objects back to byte
strings before writing out to a file (and it defaults to the encoding
the object was initialized with).
---
 debian_bundle/deb822.py |   50 ++++++++++++++++++++++++++++++++++++++--------
 tests/test_deb822.py    |    2 +-
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/debian_bundle/deb822.py b/debian_bundle/deb822.py
index b3acb69..6fed5ea 100644
--- a/debian_bundle/deb822.py
+++ b/debian_bundle/deb822.py
@@ -127,10 +127,12 @@ class Deb822Dict(object, UserDict.DictMixin):
 
     # See the end of the file for the definition of _strI
 
-    def __init__(self, _dict=None, _parsed=None, _fields=None):
+    def __init__(self, _dict=None, _parsed=None, _fields=None,
+                 encoding="utf-8"):
         self.__dict = {}
         self.__keys = OrderedSet()
         self.__parsed = None
+        self.encoding = encoding
 
         if _dict is not None:
             # _dict may be a dict or a list of two-sized tuples
@@ -165,13 +167,18 @@ class Deb822Dict(object, UserDict.DictMixin):
     def __getitem__(self, key):
         key = _strI(key)
         try:
-            return self.__dict[key]
+            value = self.__dict[key]
         except KeyError:
             if self.__parsed is not None and key in self.__keys:
-                return self.__parsed[key]
+                value = self.__parsed[key]
             else:
                 raise
 
+        if isinstance(value, str):
+            # Always return unicode objects instead of strings
+            value = value.decode(self.encoding)
+        return value
+
     def __delitem__(self, key):
         key = _strI(key)
         self.__keys.remove(key)
@@ -217,7 +224,8 @@ class Deb822Dict(object, UserDict.DictMixin):
 
 class Deb822(Deb822Dict):
 
-    def __init__(self, sequence=None, fields=None, _parsed=None):
+    def __init__(self, sequence=None, fields=None, _parsed=None,
+                 encoding="utf-8"):
         """Create a new Deb822 instance.
 
         :param sequence: a string, or any any object that returns a line of
@@ -228,6 +236,10 @@ class Deb822(Deb822Dict):
             should be parsed (the rest will be discarded).
 
         :param _parsed: internal parameter.
+
+        :param encoding: When parsing strings, interpret them in this encoding.
+            (All values are given back as unicode objects, so an encoding is
+            necessary in order to properly interpret the strings.)
         """
 
         if hasattr(sequence, 'items'):
@@ -235,7 +247,8 @@ class Deb822(Deb822Dict):
             sequence = None
         else:
             _dict = None
-        Deb822Dict.__init__(self, _dict=_dict, _parsed=_parsed, _fields=fields)
+        Deb822Dict.__init__(self, _dict=_dict, _parsed=_parsed, _fields=fields,
+                            encoding=encoding)
 
         if sequence is not None:
             try:
@@ -305,6 +318,8 @@ class Deb822(Deb822Dict):
         curkey = None
         content = ""
         for line in self.gpg_stripped_paragraph(sequence):
+            if isinstance(line, str):
+                line = line.decode(self.encoding)
             m = single.match(line)
             if m:
                 if curkey:
@@ -344,6 +359,9 @@ class Deb822(Deb822Dict):
     def __str__(self):
         return self.dump()
 
+    def __unicode__(self):
+        return self.dump()
+
     # __repr__ is handled by Deb822Dict
 
     def get_as_string(self, key):
@@ -355,10 +373,15 @@ class Deb822(Deb822Dict):
         """
         return unicode(self[key])
 
-    def dump(self, fd=None):
+    def dump(self, fd=None, encoding=None):
         """Dump the the contents in the original format
 
-        If fd is None, return a string.
+        If fd is None, return a unicode object.
+
+        If fd is not None, attempt to encode the output to the encoding the
+        object was initialized with, or the value of the encoding argument if
+        it is not None.  This will raise UnicodeEncodeError if the encoding
+        can't support all the characters in the Deb822Dict values.
         """
 
         if fd is None:
@@ -367,15 +390,24 @@ class Deb822(Deb822Dict):
         else:
             return_string = False
 
+        if encoding is None:
+            # Use the encoding we've been using to decode strings with if none
+            # was explicitly specified
+            encoding = self.encoding
+
         for key in self.iterkeys():
             value = self.get_as_string(key)
             if not value or value[0] == '\n':
                 # Avoid trailing whitespace after "Field:" if it's on its own
                 # line or the value is empty
                 # XXX Uh, really print value if value == '\n'?
-                fd.write('%s:%s\n' % (key, value))
+                entry = '%s:%s\n' % (key, value)
+            else:
+                entry = '%s: %s\n' % (key, value)
+            if not return_string:
+                fd.write(entry.encode(encoding))
             else:
-                fd.write('%s: %s\n' % (key, value))
+                fd.write(entry)
         if return_string:
             return fd.getvalue()
 
diff --git a/tests/test_deb822.py b/tests/test_deb822.py
index 0bde032..4b10838 100755
--- a/tests/test_deb822.py
+++ b/tests/test_deb822.py
@@ -146,7 +146,7 @@ CcYAoOLYDF5G1h3oR1iDNyeCI6hRW03S
     ]
 
 
-CHANGES_FILE = '''\
+CHANGES_FILE = u'''\
 Format: 1.7
 Date: Fri, 28 Dec 2007 17:08:48 +0100
 Source: bzr-gtk
-- 
1.6.6




More information about the pkg-python-debian-discuss mailing list