[SCM] WebKit Debian packaging branch, debian/experimental, updated. upstream/1.3.3-9427-gc2be6fc
ap at apple.com
ap at apple.com
Wed Dec 22 11:46:03 UTC 2010
The following commit has been merged in the debian/experimental branch:
commit 22e8c887a3f8bed125f2e875afc1ba8dd7fad98f
Author: ap at apple.com <ap at apple.com@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date: Fri Aug 6 05:36:28 2010 +0000
Reviewed by Darin Adler.
https://bugs.webkit.org/show_bug.cgi?id=43554
Way too many encoding aliases are treated as valid
<rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
<rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
Test: http/tests/misc/bad-charset-alias.html
* loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForCSSCharset):
Fix encoding name length computation. Previously, the computed length included the trailing
quote, which went unnoticed because TextEncodingRegistry ignored non-alphanumeric characters.
* platform/text/TextCodecICU.cpp: (WebCore::TextCodecICU::registerExtendedEncodingNames):
Added dashes to alias names that didn't have them. Added aliases prompted by regression tests.
* platform/text/TextCodecLatin1.cpp: (WebCore::TextCodecLatin1::registerEncodingNames):
Don't register 8859-1; other browsers do not support this encoding name.
* platform/text/TextEncoding.cpp: (WebCore::Latin1Encoding):
"Latin-1" is not a real encoding name, it's not known to Firefox or IE.
* platform/text/TextEncodingRegistry.cpp:
(WebCore::TextEncodingNameHash::equal): Changed to no longer ignore non-alphanumeric characters.
There is a good chance that we'll be missing support for some necessary alias names, but other
browsers don't ignore any characters when matching names.
(WebCore::TextEncodingNameHash::hash): Ditto.
(WebCore::checkExistingName): Re-formatted a line.
(WebCore::isUndesiredAlias): Added a filter to reject "8859_1" and any names containing commas.
(WebCore::addToTextEncodingNameMap): Used it.
(WebCore::atomicCanonicalTextEncodingName): Changed to no longer ignore non-alphanumeric characters.
git-svn-id: http://svn.webkit.org/repository/webkit/trunk@64817 268f45cc-cd09-0410-ab3c-d52691b4dbfc
diff --git a/LayoutTests/ChangeLog b/LayoutTests/ChangeLog
index be273d7..ed1c089 100644
--- a/LayoutTests/ChangeLog
+++ b/LayoutTests/ChangeLog
@@ -1,3 +1,23 @@
+2010-08-05 Alexey Proskuryakov <ap at apple.com>
+
+ Reviewed by Darin Adler.
+
+ https://bugs.webkit.org/show_bug.cgi?id=43554
+ Way too many encoding aliases are treated as valid
+
+ <rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
+
+ <rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
+
+ * fast/encoding/char-encoding-expected.txt:
+ * fast/encoding/char-encoding.html:
+ Use a correct name for GB_2312-80. At least Firefox doesn't know GB-2312-80.
+
+ * http/tests/misc/bad-charset-alias-expected.txt: Added.
+ * http/tests/misc/bad-charset-alias.html: Added.
+ * http/tests/misc/resources/bad-charset-alias.php: Added.
+ Check that certain encoding names are unknown. Both Firefox and IE don't know these.
+
2010-08-05 W. James MacLean <wjmaclean at chromium.org>
Reviewed by Nikolas Zimmermann.
diff --git a/LayoutTests/fast/encoding/char-encoding-expected.txt b/LayoutTests/fast/encoding/char-encoding-expected.txt
index ab1542a..988f73b 100644
--- a/LayoutTests/fast/encoding/char-encoding-expected.txt
+++ b/LayoutTests/fast/encoding/char-encoding-expected.txt
@@ -6,11 +6,11 @@ On success, you will see a series of "PASS" messages, followed by "TEST COMPLETE
PASS encode('UTF-8', 'U+00A0') is '%C2%A0'
PASS encode('GBK', 'U+00A5') is '%A3%A4'
PASS encode('gb2312', 'U+00A5') is '%A3%A4'
-PASS encode('GB-2312-80', 'U+00A5') is '%A3%A4'
+PASS encode('GB_2312-80', 'U+00A5') is '%A3%A4'
PASS encode('EUC-CN', 'U+00A5') is '%A3%A4'
PASS encode('GBK', 'U+20AC') is '%80'
PASS encode('gb2312', 'U+20AC') is '%80'
-PASS encode('GB-2312-80', 'U+20AC') is '%80'
+PASS encode('GB_2312-80', 'U+20AC') is '%80'
PASS encode('EUC-CN', 'U+20AC') is '%80'
PASS encode('GBK', 'U+01F9') is '%A8%BF'
PASS encode('GBK', 'U+1E3F') is '%A8%BC'
diff --git a/LayoutTests/fast/encoding/char-encoding.html b/LayoutTests/fast/encoding/char-encoding.html
index be1e05b..569cd5e 100644
--- a/LayoutTests/fast/encoding/char-encoding.html
+++ b/LayoutTests/fast/encoding/char-encoding.html
@@ -25,12 +25,12 @@ testEncode("UTF-8", "U+00A0", "%C2%A0");
//Yen symbol in gbk
testEncode('GBK', 'U+00A5', '%A3%A4');
testEncode('gb2312', 'U+00A5', '%A3%A4');
-testEncode('GB-2312-80', 'U+00A5', '%A3%A4');
+testEncode('GB_2312-80', 'U+00A5', '%A3%A4');
testEncode('EUC-CN', 'U+00A5', '%A3%A4');
//Euro symbol in gbk
testEncode('GBK', 'U+20AC', '%80');
testEncode('gb2312', 'U+20AC', '%80');
-testEncode('GB-2312-80', 'U+20AC', '%80');
+testEncode('GB_2312-80', 'U+20AC', '%80');
testEncode('EUC-CN', 'U+20AC', '%80');
//Misc symbols from TEC specific GBK translation
testEncode('GBK', 'U+01F9', '%A8%BF');
diff --git a/LayoutTests/http/tests/misc/bad-charset-alias-expected.txt b/LayoutTests/http/tests/misc/bad-charset-alias-expected.txt
new file mode 100644
index 0000000..88c213f
--- /dev/null
+++ b/LayoutTests/http/tests/misc/bad-charset-alias-expected.txt
@@ -0,0 +1,5 @@
+Test that iso-8859-1 aliases that aren't known to Firefox and IE aren't supported (we should fall back to parent frame charset).
+
+SUCCESS
+
+
diff --git a/LayoutTests/http/tests/misc/bad-charset-alias.html b/LayoutTests/http/tests/misc/bad-charset-alias.html
new file mode 100644
index 0000000..a37dcaf
--- /dev/null
+++ b/LayoutTests/http/tests/misc/bad-charset-alias.html
@@ -0,0 +1,53 @@
+<meta charset="koi8-r">
+<body>
+<p>Test that iso-8859-1 aliases that aren't known to Firefox and IE aren't supported
+(we should fall back to parent frame charset).</p>
+<p id=result>Testing...</p>
+<script>
+if (window.layoutTestController) {
+ layoutTestController.waitUntilDone();
+ layoutTestController.dumpAsText();
+}
+
+var aliases = [
+ "foobar", // Definitely unknown, verify that charset inheritance works.
+ "8859_1", // <rdar://problem/7859068>
+ "ISO8859_1", // <rdar://problem/7863399>
+ "8859-1", // WebKit used to specifically add this alias name, but other browsers don't support it.
+ "ISO_2022,locale=ja,version=0", // We never want versioned alias names, other browsers don't support these.
+ "utf 8", // Other weird variations
+ "utf_8",
+ "8859 1",
+ "8859*1",
+ "8859:1",
+ "88591",
+ "ISO_88591",
+ "ISO-88591",
+ "ISO-88-59-1",
+ "latin-1" // Yes, it's "latin1" without a dash - neither IE nor Firefox support "latin-1".
+];
+
+for (i = 0; i < aliases.length; ++i) {
+ var ifr = document.createElement("iframe");
+ ifr.setAttribute("src", "resources/bad-charset-alias.php?charset=" + aliases[i]);
+ document.body.appendChild(ifr);
+}
+
+var framesLeft = aliases.length;
+function frameLoaded()
+{
+ if (!--framesLeft) {
+ var failures = "";
+ for (i = 0; i < aliases.length; ++i) {
+ var ifr = frames[i];
+ if (ifr.document.getElementById("test").innerHTML != "SUóóåSS")
+ failures += ifr.document.getElementById("charset").innerHTML + " ";
+ }
+ document.getElementById("result").innerHTML = !failures.length ? "SUCCESS" :
+ ("FAIL: " + failures);
+ if (window.layoutTestController)
+ layoutTestController.notifyDone();
+ }
+}
+</script>
+</body>
diff --git a/LayoutTests/http/tests/misc/resources/bad-charset-alias.php b/LayoutTests/http/tests/misc/resources/bad-charset-alias.php
new file mode 100644
index 0000000..6f08908
--- /dev/null
+++ b/LayoutTests/http/tests/misc/resources/bad-charset-alias.php
@@ -0,0 +1,7 @@
+<?php
+echo '<meta charset="' . $_GET['charset'] . '">';
+echo '<body onload="top.frameLoaded()">';
+echo '<p id=charset>' . $_GET['charset'] . '</p>';
+echo '<p id=test>SUóóåSS</p>'; // "óóå" are Cyrillic characters that look like "CCE".
+echo '</body>';
+?>
diff --git a/WebCore/ChangeLog b/WebCore/ChangeLog
index 66ea8bd..46a08ff 100644
--- a/WebCore/ChangeLog
+++ b/WebCore/ChangeLog
@@ -1,3 +1,39 @@
+2010-08-05 Alexey Proskuryakov <ap at apple.com>
+
+ Reviewed by Darin Adler.
+
+ https://bugs.webkit.org/show_bug.cgi?id=43554
+ Way too many encoding aliases are treated as valid
+
+ <rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
+
+ <rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
+
+ Test: http/tests/misc/bad-charset-alias.html
+
+ * loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForCSSCharset):
+ Fix encoding name length computation. Previously, a trailing quote was ignored by
+ TextEncodingRegistry.
+
+ * platform/text/TextCodecICU.cpp: (WebCore::TextCodecICU::registerExtendedEncodingNames):
+ Added dashes to alias names that didn't have them. Added aliases prompted by regression tests.
+
+ * platform/text/TextCodecLatin1.cpp: (WebCore::TextCodecLatin1::registerEncodingNames):
+ Don't register 8859-1, other browsers do not support this encoding name.
+
+ * platform/text/TextEncoding.cpp: (WebCore::Latin1Encoding):
+ "Latin-1" is not a real encoding name, it's not known to Firefox or IE.
+
+ * platform/text/TextEncodingRegistry.cpp:
+ (WebCore::TextEncodingNameHash::equal): Changed to no longer ignore non-alphanumeric characters.
+ There is a good chance that we'll be missing support for some necessary alias names, but other
+ browsers don't ignore any characters when matching names.
+ (WebCore::TextEncodingNameHash::hash): Ditto.
+ (WebCore::checkExistingName): Re-formatted a line.
+ (WebCore::isUndesiredAlias): Added a filter to reject "8859_1" and any names containing commas.
+ (WebCore::addToTextEncodingNameMap): Used it.
+ (WebCore::atomicCanonicalTextEncodingName): Changed to no longer ignore non-alphanumeric characters.
+
2010-08-05 Simon Hausmann <simon.hausmann at nokia.com>
Reviewed by Laszlo Gombos.
diff --git a/WebCore/loader/TextResourceDecoder.cpp b/WebCore/loader/TextResourceDecoder.cpp
index 6d43d77..4002b38 100644
--- a/WebCore/loader/TextResourceDecoder.cpp
+++ b/WebCore/loader/TextResourceDecoder.cpp
@@ -488,7 +488,7 @@ bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool&
if (pos == dataEnd)
return false;
- int encodingNameLength = pos - dataStart + 1;
+ int encodingNameLength = pos - dataStart;
++pos;
if (!skipWhitespace(pos, dataEnd))
diff --git a/WebCore/platform/text/TextCodecICU.cpp b/WebCore/platform/text/TextCodecICU.cpp
index 56a4393..07bc4c8 100644
--- a/WebCore/platform/text/TextCodecICU.cpp
+++ b/WebCore/platform/text/TextCodecICU.cpp
@@ -70,10 +70,6 @@ void TextCodecICU::registerBaseCodecs(TextCodecRegistrar registrar)
registrar("UTF-8", newTextCodecICU, 0);
}
-// FIXME: Registering all the encodings we get from ucnv_getAvailableName
-// includes encodings we don't want or need. For example, all
-// the encodings with commas and version numbers.
-
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
// We register Hebrew with logical ordering using a separate name.
@@ -136,41 +132,60 @@ void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar
// table in WebKit on Macintosh that don't seem to be present in ICU.
// Perhaps we can prove these are not used on the web and remove them.
// Or perhaps we can get them added to ICU.
- registrar("xmacroman", "macintosh");
- registrar("xmacukrainian", "x-mac-cyrillic");
- registrar("cnbig5", "Big5");
- registrar("xxbig5", "Big5");
- registrar("cngb", "GBK");
+ registrar("x-mac-roman", "macintosh");
+ registrar("x-mac-ukrainian", "x-mac-cyrillic");
+ registrar("cn-big5", "Big5");
+ registrar("x-x-big5", "Big5");
+ registrar("cn-gb", "GBK");
registrar("csgb231280", "GBK");
- registrar("xeuccn", "GBK");
- registrar("xgbk", "GBK");
- registrar("csISO88598I", "ISO_8859-8-I");
+ registrar("x-euc-cn", "GBK");
+ registrar("x-gbk", "GBK");
+ registrar("csISO88598I", "ISO-8859-8-I");
registrar("koi", "KOI8-R");
registrar("logical", "ISO-8859-8-I");
registrar("unicode11utf8", "UTF-8");
registrar("unicode20utf8", "UTF-8");
- registrar("xunicode20utf8", "UTF-8");
+ registrar("x-unicode20utf8", "UTF-8");
registrar("visual", "ISO-8859-8");
registrar("winarabic", "windows-1256");
registrar("winbaltic", "windows-1257");
registrar("wincyrillic", "windows-1251");
- registrar("iso885911", "windows-874");
- registrar("dos874", "windows-874");
+ registrar("iso-8859-11", "windows-874");
+ registrar("iso8859-11", "windows-874");
+ registrar("dos-874", "windows-874");
registrar("wingreek", "windows-1253");
registrar("winhebrew", "windows-1255");
registrar("winlatin2", "windows-1250");
registrar("winturkish", "windows-1254");
registrar("winvietnamese", "windows-1258");
- registrar("xcp1250", "windows-1250");
- registrar("xcp1251", "windows-1251");
- registrar("xeuc", "EUC-JP");
- registrar("xwindows949", "windows-949");
- registrar("xuhc", "windows-949");
+ registrar("x-cp1250", "windows-1250");
+ registrar("x-cp1251", "windows-1251");
+ registrar("x-euc", "EUC-JP");
+ registrar("x-windows-949", "windows-949");
+ registrar("x-uhc", "windows-949");
+ registrar("utf8", "UTF-8");
// These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
// They are not present in ICU 3.2.
- registrar("dos720", "cp864");
+ registrar("dos-720", "cp864");
registrar("jis7", "ISO-2022-JP");
+
+ // Alternative spelling of ISO encoding names.
+ registrar("ISO8859-1", "ISO-8859-1");
+ registrar("ISO8859-2", "ISO-8859-2");
+ registrar("ISO8859-3", "ISO-8859-3");
+ registrar("ISO8859-4", "ISO-8859-4");
+ registrar("ISO8859-5", "ISO-8859-5");
+ registrar("ISO8859-6", "ISO-8859-6");
+ registrar("ISO8859-7", "ISO-8859-7");
+ registrar("ISO8859-8", "ISO-8859-8");
+ registrar("ISO8859-8-I", "ISO-8859-8-I");
+ registrar("ISO8859-9", "ISO-8859-9");
+ registrar("ISO8859-10", "ISO-8859-10");
+ registrar("ISO8859-13", "ISO-8859-13");
+ registrar("ISO8859-14", "ISO-8859-14");
+ registrar("ISO8859-15", "ISO-8859-15");
+ registrar("ISO8859-16", "ISO-8859-16");
}
void TextCodecICU::registerExtendedCodecs(TextCodecRegistrar registrar)
diff --git a/WebCore/platform/text/TextCodecLatin1.cpp b/WebCore/platform/text/TextCodecLatin1.cpp
index 55b20e8..1e9385d 100644
--- a/WebCore/platform/text/TextCodecLatin1.cpp
+++ b/WebCore/platform/text/TextCodecLatin1.cpp
@@ -79,7 +79,6 @@ void TextCodecLatin1::registerEncodingNames(EncodingNameRegistrar registrar)
registrar("ibm-1252", "windows-1252");
registrar("ibm-1252_P100-2000", "windows-1252");
- registrar("8859-1", "ISO-8859-1");
registrar("CP819", "ISO-8859-1");
registrar("IBM819", "ISO-8859-1");
registrar("csISOLatin1", "ISO-8859-1");
diff --git a/WebCore/platform/text/TextEncoding.cpp b/WebCore/platform/text/TextEncoding.cpp
index 0a997a2..29ae170 100644
--- a/WebCore/platform/text/TextEncoding.cpp
+++ b/WebCore/platform/text/TextEncoding.cpp
@@ -248,7 +248,7 @@ const TextEncoding& ASCIIEncoding()
const TextEncoding& Latin1Encoding()
{
- static TextEncoding globalLatin1Encoding("Latin-1");
+ static TextEncoding globalLatin1Encoding("latin1");
return globalLatin1Encoding;
}
diff --git a/WebCore/platform/text/TextEncodingRegistry.cpp b/WebCore/platform/text/TextEncodingRegistry.cpp
index 6ecc36f..40fcdc5 100644
--- a/WebCore/platform/text/TextEncodingRegistry.cpp
+++ b/WebCore/platform/text/TextEncodingRegistry.cpp
@@ -61,10 +61,7 @@ namespace WebCore {
const size_t maxEncodingNameLength = 63;
-// Hash for all-ASCII strings that does case folding and skips any characters
-// that are not alphanumeric. If passed any non-ASCII characters, depends on
-// the behavior of isalnum -- if that returns false as it does on OS X, then
-// it will properly skip those characters too.
+// Hash for all-ASCII strings that does case folding.
struct TextEncodingNameHash {
static bool equal(const char* s1, const char* s2)
@@ -72,12 +69,8 @@ struct TextEncodingNameHash {
char c1;
char c2;
do {
- do
- c1 = *s1++;
- while (c1 && !isASCIIAlphanumeric(c1));
- do
- c2 = *s2++;
- while (c2 && !isASCIIAlphanumeric(c2));
+ c1 = *s1++;
+ c2 = *s2++;
if (toASCIILower(c1) != toASCIILower(c2))
return false;
} while (c1 && c2);
@@ -91,16 +84,13 @@ struct TextEncodingNameHash {
{
unsigned h = WTF::stringHashingStartValue;
for (;;) {
- char c;
- do {
- c = *s++;
- if (!c) {
- h += (h << 3);
- h ^= (h >> 11);
- h += (h << 15);
- return h;
- }
- } while (!isASCIIAlphanumeric(c));
+ char c = *s++;
+ if (!c) {
+ h += (h << 3);
+ h ^= (h >> 11);
+ h += (h << 15);
+ return h;
+ }
h += toASCIILower(c);
h += (h << 10);
h ^= (h >> 6);
@@ -154,15 +144,30 @@ static void checkExistingName(const char* alias, const char* atomicName)
&& strcmp(oldAtomicName, "ISO-8859-8-I") == 0
&& strcasecmp(atomicName, "iso-8859-8") == 0)
return;
- LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s",
- alias, oldAtomicName, atomicName);
+ LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
}
#endif
+static bool isUndesiredAlias(const char* alias)
+{
+ // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
+ for (const char* p = alias; *p; ++p) {
+ if (*p == ',')
+ return true;
+ }
+ // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
+ // problem, see bug 43554.
+ if (0 == strcmp(alias, "8859_1"))
+ return true;
+ return false;
+}
+
static void addToTextEncodingNameMap(const char* alias, const char* name)
{
ASSERT(strlen(alias) <= maxEncodingNameLength);
+ if (isUndesiredAlias(alias))
+ return;
const char* atomicName = textEncodingNameMap->get(name);
ASSERT(strcmp(alias, name) == 0 || atomicName);
if (!atomicName)
@@ -300,11 +305,9 @@ const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t leng
size_t j = 0;
for (size_t i = 0; i < length; ++i) {
UChar c = characters[i];
- if (isASCIIAlphanumeric(c)) {
- if (j == maxEncodingNameLength)
- return 0;
- buffer[j++] = c;
- }
+ if (j == maxEncodingNameLength)
+ return 0;
+ buffer[j++] = c;
}
buffer[j] = 0;
return atomicCanonicalTextEncodingName(buffer);
--
WebKit Debian packaging
More information about the Pkg-webkit-commits
mailing list