[SCM] WebKit Debian packaging branch, debian/experimental, updated. upstream/1.3.3-9427-gc2be6fc

Wed Dec 22 11:46:03 UTC 2010

The following commit has been merged in the debian/experimental branch:
commit 22e8c887a3f8bed125f2e875afc1ba8dd7fad98f
Author: ap at apple.com <ap at apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date:   Fri Aug 6 05:36:28 2010 +0000

            Reviewed by Darin Adler.
    
            https://bugs.webkit.org/show_bug.cgi?id=43554
            Way too many encoding aliases are treated as valid
    
            <rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
    
            <rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
    
            Test: http/tests/misc/bad-charset-alias.html
    
            * loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForCSSCharset):
            Fix encoding name length computation. Previously, a trailing quote was ignored by
            TextEncodingRegistry.
    
            * platform/text/TextCodecICU.cpp: (WebCore::TextCodecICU::registerExtendedEncodingNames):
            Added dashes to alias names that didn't have them. Added aliases prompted by regression tests.
    
            * platform/text/TextCodecLatin1.cpp: (WebCore::TextCodecLatin1::registerEncodingNames):
            Don't register 8859-1; other browsers do not support this encoding name.
    
            * platform/text/TextEncoding.cpp: (WebCore::Latin1Encoding):
            "Latin-1" is not a real encoding name; it is not known to Firefox or IE.
    
            * platform/text/TextEncodingRegistry.cpp:
            (WebCore::TextEncodingNameHash::equal): Changed to no longer ignore non-alphanumeric characters.
            There is a good chance that we'll be missing support for some necessary alias names, but other
            browsers don't ignore any characters when matching names.
            (WebCore::TextEncodingNameHash::hash): Ditto.
            (WebCore::checkExistingName): Re-formatted a line.
            (WebCore::isUndesiredAlias): Added a filter to reject "8859_1" and any names containing commas.
            (WebCore::addToTextEncodingNameMap): Used it.
            (WebCore::atomicCanonicalTextEncodingName): Changed to no longer ignore non-alphanumeric characters.
    
    
    
    git-svn-id: http://svn.webkit.org/repository/webkit/trunk@64817 268f45cc-cd09-0410-ab3c-d52691b4dbfc

diff --git a/LayoutTests/ChangeLog b/LayoutTests/ChangeLog
index be273d7..ed1c089 100644
--- a/LayoutTests/ChangeLog
+++ b/LayoutTests/ChangeLog
@@ -1,3 +1,23 @@
+2010-08-05  Alexey Proskuryakov  <ap at apple.com>
+
+        Reviewed by Darin Adler.
+
+        https://bugs.webkit.org/show_bug.cgi?id=43554
+        Way too many encoding aliases are treated as valid
+
+        <rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
+
+        <rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
+
+        * fast/encoding/char-encoding-expected.txt:
+        * fast/encoding/char-encoding.html:
+        Use a correct name for GB_2312-80. At least Firefox doesn't know GB-2312-80.
+
+        * http/tests/misc/bad-charset-alias-expected.txt: Added.
+        * http/tests/misc/bad-charset-alias.html: Added.
+        * http/tests/misc/resources/bad-charset-alias.php: Added.
+        Check that certain encoding names are unknown. Neither Firefox nor IE knows these.
+
 2010-08-05  W. James MacLean  <wjmaclean at chromium.org>
 
         Reviewed by Nikolas Zimmermann.
diff --git a/LayoutTests/fast/encoding/char-encoding-expected.txt b/LayoutTests/fast/encoding/char-encoding-expected.txt
index ab1542a..988f73b 100644
--- a/LayoutTests/fast/encoding/char-encoding-expected.txt
+++ b/LayoutTests/fast/encoding/char-encoding-expected.txt
@@ -6,11 +6,11 @@ On success, you will see a series of "PASS" messages, followed by "TEST COMPLETE
 PASS encode('UTF-8', 'U+00A0') is '%C2%A0'
 PASS encode('GBK', 'U+00A5') is '%A3%A4'
 PASS encode('gb2312', 'U+00A5') is '%A3%A4'
-PASS encode('GB-2312-80', 'U+00A5') is '%A3%A4'
+PASS encode('GB_2312-80', 'U+00A5') is '%A3%A4'
 PASS encode('EUC-CN', 'U+00A5') is '%A3%A4'
 PASS encode('GBK', 'U+20AC') is '%80'
 PASS encode('gb2312', 'U+20AC') is '%80'
-PASS encode('GB-2312-80', 'U+20AC') is '%80'
+PASS encode('GB_2312-80', 'U+20AC') is '%80'
 PASS encode('EUC-CN', 'U+20AC') is '%80'
 PASS encode('GBK', 'U+01F9') is '%A8%BF'
 PASS encode('GBK', 'U+1E3F') is '%A8%BC'
diff --git a/LayoutTests/fast/encoding/char-encoding.html b/LayoutTests/fast/encoding/char-encoding.html
index be1e05b..569cd5e 100644
--- a/LayoutTests/fast/encoding/char-encoding.html
+++ b/LayoutTests/fast/encoding/char-encoding.html
@@ -25,12 +25,12 @@ testEncode("UTF-8", "U+00A0", "%C2%A0");
 //Yen symbol in gbk
 testEncode('GBK', 'U+00A5', '%A3%A4');
 testEncode('gb2312', 'U+00A5', '%A3%A4');
-testEncode('GB-2312-80', 'U+00A5', '%A3%A4');
+testEncode('GB_2312-80', 'U+00A5', '%A3%A4');
 testEncode('EUC-CN', 'U+00A5', '%A3%A4');
 //Euro symbol in gbk
 testEncode('GBK', 'U+20AC', '%80');
 testEncode('gb2312', 'U+20AC', '%80');
-testEncode('GB-2312-80', 'U+20AC', '%80');
+testEncode('GB_2312-80', 'U+20AC', '%80');
 testEncode('EUC-CN', 'U+20AC', '%80');
 //Misc symbols from TEC specific GBK translation 
 testEncode('GBK', 'U+01F9', '%A8%BF');
diff --git a/LayoutTests/http/tests/misc/bad-charset-alias-expected.txt b/LayoutTests/http/tests/misc/bad-charset-alias-expected.txt
new file mode 100644
index 0000000..88c213f
--- /dev/null
+++ b/LayoutTests/http/tests/misc/bad-charset-alias-expected.txt
@@ -0,0 +1,5 @@
+Test that iso-8859-1 aliases that aren't known to Firefox and IE aren't supported (we should fall back to parent frame charset).
+
+SUCCESS
+
+
diff --git a/LayoutTests/http/tests/misc/bad-charset-alias.html b/LayoutTests/http/tests/misc/bad-charset-alias.html
new file mode 100644
index 0000000..a37dcaf
--- /dev/null
+++ b/LayoutTests/http/tests/misc/bad-charset-alias.html
@@ -0,0 +1,53 @@
+<meta charset="koi8-r">
+<body>
+<p>Test that iso-8859-1 aliases that aren't known to Firefox and IE aren't supported
+(we should fall back to parent frame charset).</p>
+<p id=result>Testing...</p>
+<script>
+if (window.layoutTestController) {
+    layoutTestController.waitUntilDone();
+    layoutTestController.dumpAsText();
+}
+
+var aliases = [
+    "foobar", // Definitely unknown, verify that charset inheritance works.
+    "8859_1", // <rdar://problem/7859068>
+    "ISO8859_1", // <rdar://problem/7863399>
+    "8859-1", // WebKit used to specifically add this alias name, but other browsers don't support it.
+    "ISO_2022,locale=ja,version=0", // We never want versioned alias names, other browsers don't support these.
+    "utf 8", // Other weird variations
+    "utf_8",
+    "8859 1",
+    "8859*1",
+    "8859:1",
+    "88591",
+    "ISO_88591",
+    "ISO-88591",
+    "ISO-88-59-1",
+    "latin-1" // Yes, it's "latin1" without a dash - neither IE nor Firefox support "latin-1".
+];
+    
+for (i = 0; i < aliases.length; ++i) {
+    var ifr = document.createElement("iframe");
+    ifr.setAttribute("src", "resources/bad-charset-alias.php?charset=" + aliases[i]);
+    document.body.appendChild(ifr);
+}
+
+var framesLeft = aliases.length;
+function frameLoaded()
+{
+    if (!--framesLeft) {
+        var failures = "";
+        for (i = 0; i < aliases.length; ++i) {
+            var ifr = frames[i];
+            if (ifr.document.getElementById("test").innerHTML != "SUóóåSS")
+                failures += ifr.document.getElementById("charset").innerHTML + " ";
+        }
+        document.getElementById("result").innerHTML = !failures.length ? "SUCCESS" : 
+            ("FAIL: " + failures);
+        if (window.layoutTestController)
+            layoutTestController.notifyDone();
+    }
+}
+</script>
+</body>
diff --git a/LayoutTests/http/tests/misc/resources/bad-charset-alias.php b/LayoutTests/http/tests/misc/resources/bad-charset-alias.php
new file mode 100644
index 0000000..6f08908
--- /dev/null
+++ b/LayoutTests/http/tests/misc/resources/bad-charset-alias.php
@@ -0,0 +1,7 @@
+<?php
+echo '<meta charset="' . $_GET['charset'] . '">';
+echo '<body onload="top.frameLoaded()">';
+echo '<p id=charset>' . $_GET['charset'] . '</p>';
+echo '<p id=test>SUóóåSS</p>'; // "óóå" are Cyrillic characters that look like "CCE".
+echo '</body>';
+?>
diff --git a/WebCore/ChangeLog b/WebCore/ChangeLog
index 66ea8bd..46a08ff 100644
--- a/WebCore/ChangeLog
+++ b/WebCore/ChangeLog
@@ -1,3 +1,39 @@
+2010-08-05  Alexey Proskuryakov  <ap at apple.com>
+
+        Reviewed by Darin Adler.
+
+        https://bugs.webkit.org/show_bug.cgi?id=43554
+        Way too many encoding aliases are treated as valid
+
+        <rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
+
+        <rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
+
+        Test: http/tests/misc/bad-charset-alias.html
+
+        * loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForCSSCharset):
+        Fix encoding name length computation. Previously, a trailing quote was ignored by
+        TextEncodingRegistry.
+        
+        * platform/text/TextCodecICU.cpp: (WebCore::TextCodecICU::registerExtendedEncodingNames):
+        Added dashes to alias names that didn't have them. Added aliases prompted by regression tests.
+
+        * platform/text/TextCodecLatin1.cpp: (WebCore::TextCodecLatin1::registerEncodingNames):
+        Don't register 8859-1; other browsers do not support this encoding name.
+
+        * platform/text/TextEncoding.cpp: (WebCore::Latin1Encoding):
+        "Latin-1" is not a real encoding name; it is not known to Firefox or IE.
+
+        * platform/text/TextEncodingRegistry.cpp:
+        (WebCore::TextEncodingNameHash::equal): Changed to no longer ignore non-alphanumeric characters.
+        There is a good chance that we'll be missing support for some necessary alias names, but other
+        browsers don't ignore any characters when matching names.
+        (WebCore::TextEncodingNameHash::hash): Ditto.
+        (WebCore::checkExistingName): Re-formatted a line.
+        (WebCore::isUndesiredAlias): Added a filter to reject "8859_1" and any names containing commas.
+        (WebCore::addToTextEncodingNameMap): Used it.
+        (WebCore::atomicCanonicalTextEncodingName): Changed to no longer ignore non-alphanumeric characters.
+
 2010-08-05  Simon Hausmann  <simon.hausmann at nokia.com>
 
         Reviewed by Laszlo Gombos.
diff --git a/WebCore/loader/TextResourceDecoder.cpp b/WebCore/loader/TextResourceDecoder.cpp
index 6d43d77..4002b38 100644
--- a/WebCore/loader/TextResourceDecoder.cpp
+++ b/WebCore/loader/TextResourceDecoder.cpp
@@ -488,7 +488,7 @@ bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool&
                 if (pos == dataEnd)
                     return false;
 
-                int encodingNameLength = pos - dataStart + 1;
+                int encodingNameLength = pos - dataStart;
                 
                 ++pos;
                 if (!skipWhitespace(pos, dataEnd))
diff --git a/WebCore/platform/text/TextCodecICU.cpp b/WebCore/platform/text/TextCodecICU.cpp
index 56a4393..07bc4c8 100644
--- a/WebCore/platform/text/TextCodecICU.cpp
+++ b/WebCore/platform/text/TextCodecICU.cpp
@@ -70,10 +70,6 @@ void TextCodecICU::registerBaseCodecs(TextCodecRegistrar registrar)
     registrar("UTF-8", newTextCodecICU, 0);
 }
 
-// FIXME: Registering all the encodings we get from ucnv_getAvailableName
-// includes encodings we don't want or need. For example, all
-// the encodings with commas and version numbers.
-
 void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
 {
     // We register Hebrew with logical ordering using a separate name.
@@ -136,41 +132,60 @@ void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar
     // table in WebKit on Macintosh that don't seem to be present in ICU.
     // Perhaps we can prove these are not used on the web and remove them.
     // Or perhaps we can get them added to ICU.
-    registrar("xmacroman", "macintosh");
-    registrar("xmacukrainian", "x-mac-cyrillic");
-    registrar("cnbig5", "Big5");
-    registrar("xxbig5", "Big5");
-    registrar("cngb", "GBK");
+    registrar("x-mac-roman", "macintosh");
+    registrar("x-mac-ukrainian", "x-mac-cyrillic");
+    registrar("cn-big5", "Big5");
+    registrar("x-x-big5", "Big5");
+    registrar("cn-gb", "GBK");
     registrar("csgb231280", "GBK");
-    registrar("xeuccn", "GBK");
-    registrar("xgbk", "GBK");
-    registrar("csISO88598I", "ISO_8859-8-I");
+    registrar("x-euc-cn", "GBK");
+    registrar("x-gbk", "GBK");
+    registrar("csISO88598I", "ISO-8859-8-I");
     registrar("koi", "KOI8-R");
     registrar("logical", "ISO-8859-8-I");
     registrar("unicode11utf8", "UTF-8");
     registrar("unicode20utf8", "UTF-8");
-    registrar("xunicode20utf8", "UTF-8");
+    registrar("x-unicode20utf8", "UTF-8");
     registrar("visual", "ISO-8859-8");
     registrar("winarabic", "windows-1256");
     registrar("winbaltic", "windows-1257");
     registrar("wincyrillic", "windows-1251");
-    registrar("iso885911", "windows-874");
-    registrar("dos874", "windows-874");
+    registrar("iso-8859-11", "windows-874");
+    registrar("iso8859-11", "windows-874");
+    registrar("dos-874", "windows-874");
     registrar("wingreek", "windows-1253");
     registrar("winhebrew", "windows-1255");
     registrar("winlatin2", "windows-1250");
     registrar("winturkish", "windows-1254");
     registrar("winvietnamese", "windows-1258");
-    registrar("xcp1250", "windows-1250");
-    registrar("xcp1251", "windows-1251");
-    registrar("xeuc", "EUC-JP");
-    registrar("xwindows949", "windows-949");
-    registrar("xuhc", "windows-949");
+    registrar("x-cp1250", "windows-1250");
+    registrar("x-cp1251", "windows-1251");
+    registrar("x-euc", "EUC-JP");
+    registrar("x-windows-949", "windows-949");
+    registrar("x-uhc", "windows-949");
+    registrar("utf8", "UTF-8");
 
     // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
     // They are not present in ICU 3.2.
-    registrar("dos720", "cp864");
+    registrar("dos-720", "cp864");
     registrar("jis7", "ISO-2022-JP");
+
+    // Alternative spelling of ISO encoding names.
+    registrar("ISO8859-1", "ISO-8859-1");
+    registrar("ISO8859-2", "ISO-8859-2");
+    registrar("ISO8859-3", "ISO-8859-3");
+    registrar("ISO8859-4", "ISO-8859-4");
+    registrar("ISO8859-5", "ISO-8859-5");
+    registrar("ISO8859-6", "ISO-8859-6");
+    registrar("ISO8859-7", "ISO-8859-7");
+    registrar("ISO8859-8", "ISO-8859-8");
+    registrar("ISO8859-8-I", "ISO-8859-8-I");
+    registrar("ISO8859-9", "ISO-8859-9");
+    registrar("ISO8859-10", "ISO-8859-10");
+    registrar("ISO8859-13", "ISO-8859-13");
+    registrar("ISO8859-14", "ISO-8859-14");
+    registrar("ISO8859-15", "ISO-8859-15");
+    registrar("ISO8859-16", "ISO-8859-16");
 }
 
 void TextCodecICU::registerExtendedCodecs(TextCodecRegistrar registrar)
diff --git a/WebCore/platform/text/TextCodecLatin1.cpp b/WebCore/platform/text/TextCodecLatin1.cpp
index 55b20e8..1e9385d 100644
--- a/WebCore/platform/text/TextCodecLatin1.cpp
+++ b/WebCore/platform/text/TextCodecLatin1.cpp
@@ -79,7 +79,6 @@ void TextCodecLatin1::registerEncodingNames(EncodingNameRegistrar registrar)
     registrar("ibm-1252", "windows-1252");
     registrar("ibm-1252_P100-2000", "windows-1252");
 
-    registrar("8859-1", "ISO-8859-1");
     registrar("CP819", "ISO-8859-1");
     registrar("IBM819", "ISO-8859-1");
     registrar("csISOLatin1", "ISO-8859-1");
diff --git a/WebCore/platform/text/TextEncoding.cpp b/WebCore/platform/text/TextEncoding.cpp
index 0a997a2..29ae170 100644
--- a/WebCore/platform/text/TextEncoding.cpp
+++ b/WebCore/platform/text/TextEncoding.cpp
@@ -248,7 +248,7 @@ const TextEncoding& ASCIIEncoding()
 
 const TextEncoding& Latin1Encoding()
 {
-    static TextEncoding globalLatin1Encoding("Latin-1");
+    static TextEncoding globalLatin1Encoding("latin1");
     return globalLatin1Encoding;
 }
 
diff --git a/WebCore/platform/text/TextEncodingRegistry.cpp b/WebCore/platform/text/TextEncodingRegistry.cpp
index 6ecc36f..40fcdc5 100644
--- a/WebCore/platform/text/TextEncodingRegistry.cpp
+++ b/WebCore/platform/text/TextEncodingRegistry.cpp
@@ -61,10 +61,7 @@ namespace WebCore {
 
 const size_t maxEncodingNameLength = 63;
 
-// Hash for all-ASCII strings that does case folding and skips any characters
-// that are not alphanumeric. If passed any non-ASCII characters, depends on
-// the behavior of isalnum -- if that returns false as it does on OS X, then
-// it will properly skip those characters too.
+// Hash for all-ASCII strings that does case folding.
 struct TextEncodingNameHash {
 
     static bool equal(const char* s1, const char* s2)
@@ -72,12 +69,8 @@ struct TextEncodingNameHash {
         char c1;
         char c2;
         do {
-            do
-                c1 = *s1++;
-            while (c1 && !isASCIIAlphanumeric(c1));
-            do
-                c2 = *s2++;
-            while (c2 && !isASCIIAlphanumeric(c2));
+            c1 = *s1++;
+            c2 = *s2++;
             if (toASCIILower(c1) != toASCIILower(c2))
                 return false;
         } while (c1 && c2);
@@ -91,16 +84,13 @@ struct TextEncodingNameHash {
     {
         unsigned h = WTF::stringHashingStartValue;
         for (;;) {
-            char c;
-            do {
-                c = *s++;
-                if (!c) {
-                    h += (h << 3);
-                    h ^= (h >> 11);
-                    h += (h << 15);
-                    return h;
-                }
-            } while (!isASCIIAlphanumeric(c));
+            char c = *s++;
+            if (!c) {
+                h += (h << 3);
+                h ^= (h >> 11);
+                h += (h << 15);
+                return h;
+            }
             h += toASCIILower(c);
             h += (h << 10); 
             h ^= (h >> 6); 
@@ -154,15 +144,30 @@ static void checkExistingName(const char* alias, const char* atomicName)
             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
             && strcasecmp(atomicName, "iso-8859-8") == 0)
         return;
-    LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s",
-        alias, oldAtomicName, atomicName);
+    LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
 }
 
 #endif
 
+static bool isUndesiredAlias(const char* alias)
+{
+    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
+    for (const char* p = alias; *p; ++p) {
+        if (*p == ',')
+            return true;
+    }
+    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
+    // problem, see bug 43554.
+    if (0 == strcmp(alias, "8859_1"))
+        return true;
+    return false;
+}
+
 static void addToTextEncodingNameMap(const char* alias, const char* name)
 {
     ASSERT(strlen(alias) <= maxEncodingNameLength);
+    if (isUndesiredAlias(alias))
+        return;
     const char* atomicName = textEncodingNameMap->get(name);
     ASSERT(strcmp(alias, name) == 0 || atomicName);
     if (!atomicName)
@@ -300,11 +305,9 @@ const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t leng
     size_t j = 0;
     for (size_t i = 0; i < length; ++i) {
         UChar c = characters[i];
-        if (isASCIIAlphanumeric(c)) {
-            if (j == maxEncodingNameLength)
-                return 0;
-            buffer[j++] = c;
-        }
+        if (j == maxEncodingNameLength)
+            return 0;
+        buffer[j++] = c;
     }
     buffer[j] = 0;
     return atomicCanonicalTextEncodingName(buffer);

-- 
WebKit Debian packaging