[SCM] WebKit Debian packaging branch, debian/experimental, updated. upstream/1.3.3-9427-gc2be6fc

Wed Dec 22 13:24:24 UTC 2010

The following commit has been merged in the debian/experimental branch:
commit 31826278238ba1efe5c8018837e3a54a500d77b9
Author: abarth at webkit.org <abarth at webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date:   Tue Sep 14 14:25:38 2010 +0000

    2010-09-14  Adam Barth  <abarth at webkit.org>
    
            Reviewed by Darin Adler.
    
            Move adjustLexerState to the HTMLTokenizer
            https://bugs.webkit.org/show_bug.cgi?id=45649
    
            This function used to let us share code between the tree builder and the
            two other users of the HTMLTokenizer.  However, now that the tree
            builder is all grown up, it can't use this code anymore.  The other
            users would much rather this code lived on HTMLTokenizer and understood
            <script>.  This patch attempts to make them happy.
    
            * html/parser/HTMLPreloadScanner.cpp:
            (WebCore::HTMLPreloadScanner::processToken):
            * html/parser/HTMLTokenizer.cpp:
            (WebCore::HTMLTokenizer::updateStateFor):
            * html/parser/HTMLTokenizer.h:
            * html/parser/HTMLTreeBuilder.cpp:
            (WebCore::HTMLTreeBuilder::scriptEnabled):
            * html/parser/HTMLTreeBuilder.h:
            * html/parser/HTMLViewSourceParser.cpp:
            (WebCore::HTMLViewSourceParser::updateTokenizerState):
    
    git-svn-id: http://svn.webkit.org/repository/webkit/trunk@67467 268f45cc-cd09-0410-ab3c-d52691b4dbfc

diff --git a/WebCore/ChangeLog b/WebCore/ChangeLog
index 81a5b87..1db8d8b 100644
--- a/WebCore/ChangeLog
+++ b/WebCore/ChangeLog
@@ -1,3 +1,27 @@
+2010-09-14  Adam Barth  <abarth at webkit.org>
+
+        Reviewed by Darin Adler.
+
+        Move adjustLexerState to the HTMLTokenizer
+        https://bugs.webkit.org/show_bug.cgi?id=45649
+
+        This function used to let us share code between the tree builder and the
+        two other users of the HTMLTokenizer.  However, now that the tree
+        builder is all grown up, it can't use this code anymore.  The other
+        users would much rather this code lived on HTMLTokenizer and understood
+        <script>.  This patch attempts to make them happy.
+
+        * html/parser/HTMLPreloadScanner.cpp:
+        (WebCore::HTMLPreloadScanner::processToken):
+        * html/parser/HTMLTokenizer.cpp:
+        (WebCore::HTMLTokenizer::updateStateFor):
+        * html/parser/HTMLTokenizer.h:
+        * html/parser/HTMLTreeBuilder.cpp:
+        (WebCore::HTMLTreeBuilder::scriptEnabled):
+        * html/parser/HTMLTreeBuilder.h:
+        * html/parser/HTMLViewSourceParser.cpp:
+        (WebCore::HTMLViewSourceParser::updateTokenizerState):
+
 2010-09-14  Pierre-Antoine LaFayette  <plafayet at codeaurora.org>
 
         Reviewed by Darin Adler.
diff --git a/WebCore/html/parser/HTMLPreloadScanner.cpp b/WebCore/html/parser/HTMLPreloadScanner.cpp
index 5283fa3..9f4185b 100644
--- a/WebCore/html/parser/HTMLPreloadScanner.cpp
+++ b/WebCore/html/parser/HTMLPreloadScanner.cpp
@@ -32,7 +32,6 @@
 #include "CachedResourceLoader.h"
 #include "Document.h"
 #include "HTMLTokenizer.h"
-#include "HTMLTreeBuilder.h"
 #include "HTMLLinkElement.h"
 #include "HTMLNames.h"
 
@@ -157,13 +156,7 @@ void HTMLPreloadScanner::processToken()
         return;
 
     PreloadTask task(m_token);
-    m_tokenizer->setState(HTMLTreeBuilder::adjustedLexerState(m_tokenizer->state(), task.tagName(), m_document->frame()));
-    if (task.tagName() == scriptTag) {
-        // The tree builder handles scriptTag separately from the other tokenizer
-        // state adjustments, so we need to handle it separately too.
-        ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState);
-        m_tokenizer->setState(HTMLTokenizer::ScriptDataState);
-    }
+    m_tokenizer->updateStateFor(task.tagName(), m_document->frame());
 
     if (task.tagName() == bodyTag)
         m_bodySeen = true;
diff --git a/WebCore/html/parser/HTMLTokenizer.cpp b/WebCore/html/parser/HTMLTokenizer.cpp
index 63612e6..99bdb60 100644
--- a/WebCore/html/parser/HTMLTokenizer.cpp
+++ b/WebCore/html/parser/HTMLTokenizer.cpp
@@ -30,6 +30,7 @@
 
 #include "HTMLEntityParser.h"
 #include "HTMLToken.h"
+#include "HTMLTreeBuilder.h"
 #include "HTMLNames.h"
 #include "NotImplemented.h"
 #include <wtf/ASCIICType.h>
@@ -171,7 +172,7 @@ inline bool HTMLTokenizer::processEntity(SegmentedString& source)
 
 // Sometimes there's more complicated logic in the spec that separates when
 // we consume the next input character and when we switch to a particular
-// state.  We handle those cases by advancing the source directly and using
+// state. We handle those cases by advancing the source directly and using
 // this macro to switch to the indicated state.
 #define SWITCH_TO(stateName)                                               \
     do {                                                                   \
@@ -277,7 +278,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
 
     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
     // Note that this logic is different than the generic \r\n collapsing
-    // handled in the input stream preprocessor.  This logic is here as an
+    // handled in the input stream preprocessor. This logic is here as an
     // "authoring convenience" so folks can write:
     //
     // <pre>
@@ -1054,7 +1055,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
                 m_token->appendToAttributeValue(*iter);
         }
         // We're supposed to switch back to the attribute value state that
-        // we were in when we were switched into this state.  Rather than
+        // we were in when we were switched into this state. Rather than
         // keeping track of this explictly, we observe that the previous
         // state can be determined by m_additionalAllowedCharacter.
         if (m_additionalAllowedCharacter == '"')
@@ -1632,6 +1633,23 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
     return false;
 }
 
+void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame)
+{
+    if (tagName == textareaTag || tagName == titleTag)
+        setState(RCDATAState);
+    else if (tagName == plaintextTag)
+        setState(PLAINTEXTState);
+    else if (tagName == scriptTag)
+        setState(ScriptDataState);
+    else if (tagName == styleTag
+        || tagName == iframeTag
+        || tagName == xmpTag
+        || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame))
+        || tagName == noframesTag
+        || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame)))
+        setState(RAWTEXTState);
+}
+
 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
 {
     return vectorEqualsString(m_temporaryBuffer, expectedString);
diff --git a/WebCore/html/parser/HTMLTokenizer.h b/WebCore/html/parser/HTMLTokenizer.h
index eb6eeda..5318944 100644
--- a/WebCore/html/parser/HTMLTokenizer.h
+++ b/WebCore/html/parser/HTMLTokenizer.h
@@ -36,6 +36,7 @@
 namespace WebCore {
 
 class Element;
+class Frame;
 class HTMLToken;
 
 class HTMLTokenizer : public Noncopyable {
@@ -123,7 +124,7 @@ public:
 
     void reset();
 
-    // This function returns true if it emits a token.  Otherwise, callers
+    // This function returns true if it emits a token. Otherwise, callers
     // must provide the same (in progress) token on the next call (unless
     // they call reset() first).
     bool nextToken(SegmentedString&, HTMLToken&);
@@ -134,6 +135,22 @@ public:
     State state() const { return m_state; }
     void setState(State state) { m_state = state; }
 
+    // Updates the tokenizer's state according to the given tag name. This is
+    // an approximation of how the tree builder would update the tokenizer's
+    // state. This method is useful for approximating HTML tokenization. To
+    // get exactly the correct tokenization, you need the real tree builder.
+    //
+    // The main failures in the approximation are as follows:
+    //
+    //  * The first set of character tokens emitted for a <pre> element might
+    //    contain an extra leading newline.
+    //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
+    //    tree builder's insertion mode.
+    //  * CDATA sections in foreign content will be tokenized as bogus comments
+    //    instead of as character tokens.
+    //
+    void updateStateFor(const AtomicString& tagName, Frame*);
+
     // Hack to skip leading newline in <pre>/<listing> for authoring ease.
     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
     void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
@@ -176,8 +193,8 @@ private:
 
             // Every branch in this function is expensive, so we have a
             // fast-reject branch for characters that don't require special
-            // handling.  Please run the parser benchmark whenever you touch
-            // this function.  It's very hot.
+            // handling. Please run the parser benchmark whenever you touch
+            // this function. It's very hot.
             static const UChar specialCharacterMask = '\n' | '\r' | '\0';
             if (m_nextInputCharacter & ~specialCharacterMask) {
                 m_skipNextNewLine = false;
@@ -256,7 +273,7 @@ private:
     inline bool temporaryBufferIs(const String&);
 
     // Sometimes we speculatively consume input characters and we don't
-    // know whether they represent end tags or RCDATA, etc.  These
+    // know whether they represent end tags or RCDATA, etc. These
     // functions help manage these state.
     inline void addToPossibleEndTag(UChar cc);
     inline void saveEndTagNameIfNeeded();
@@ -268,7 +285,7 @@ private:
 
     Vector<UChar, 32> m_appropriateEndTagName;
 
-    // m_token is owned by the caller.  If nextToken is not on the stack,
+    // m_token is owned by the caller. If nextToken is not on the stack,
     // this member might be pointing to unallocated memory.
     HTMLToken* m_token;
     int m_lineNumber;
@@ -281,7 +298,7 @@ private:
     Vector<UChar, 32> m_temporaryBuffer;
 
     // We occationally want to emit both a character token and an end tag
-    // token (e.g., when lexing script).  We buffer the name of the end tag
+    // token (e.g., when lexing script). We buffer the name of the end tag
     // token here so we remember it next time we re-enter the tokenizer.
     Vector<UChar, 32> m_bufferedEndTagName;
 
diff --git a/WebCore/html/parser/HTMLTreeBuilder.cpp b/WebCore/html/parser/HTMLTreeBuilder.cpp
index 40a6651..afac2a0 100644
--- a/WebCore/html/parser/HTMLTreeBuilder.cpp
+++ b/WebCore/html/parser/HTMLTreeBuilder.cpp
@@ -425,25 +425,6 @@ PassRefPtr<Element> HTMLTreeBuilder::takeScriptToProcess(int& scriptStartLine)
     return m_scriptToProcess.release();
 }
 
-HTMLTokenizer::State HTMLTreeBuilder::adjustedLexerState(HTMLTokenizer::State state, const AtomicString& tagName, Frame* frame)
-{
-    if (tagName == textareaTag || tagName == titleTag)
-        return HTMLTokenizer::RCDATAState;
-
-    if (tagName == styleTag
-        || tagName == iframeTag
-        || tagName == xmpTag
-        || (tagName == noembedTag && pluginsEnabled(frame))
-        || tagName == noframesTag
-        || (tagName == noscriptTag && scriptEnabled(frame)))
-        return HTMLTokenizer::RAWTEXTState;
-
-    if (tagName == plaintextTag)
-        return HTMLTokenizer::PLAINTEXTState;
-
-    return state;
-}
-
 void HTMLTreeBuilder::constructTreeFromToken(HTMLToken& rawToken)
 {
     AtomicHTMLToken token(rawToken);
@@ -2778,9 +2759,7 @@ bool HTMLTreeBuilder::scriptEnabled(Frame* frame)
 {
     if (!frame)
         return false;
-    if (ScriptController* scriptController = frame->script())
-        return scriptController->canExecuteScripts(NotAboutToExecuteScript);
-    return false;
+    return frame->script()->canExecuteScripts(NotAboutToExecuteScript);
 }
 
 bool HTMLTreeBuilder::pluginsEnabled(Frame* frame)
diff --git a/WebCore/html/parser/HTMLTreeBuilder.h b/WebCore/html/parser/HTMLTreeBuilder.h
index 894f11a..d522ea8 100644
--- a/WebCore/html/parser/HTMLTreeBuilder.h
+++ b/WebCore/html/parser/HTMLTreeBuilder.h
@@ -76,8 +76,6 @@ public:
     // Done, close any open tags, etc.
     void finished();
 
-    static HTMLTokenizer::State adjustedLexerState(HTMLTokenizer::State, const AtomicString& tagName, Frame*);
-
     static bool scriptEnabled(Frame*);
     static bool pluginsEnabled(Frame*);
 
diff --git a/WebCore/html/parser/HTMLViewSourceParser.cpp b/WebCore/html/parser/HTMLViewSourceParser.cpp
index 8a7984d..f31c0a2 100644
--- a/WebCore/html/parser/HTMLViewSourceParser.cpp
+++ b/WebCore/html/parser/HTMLViewSourceParser.cpp
@@ -27,7 +27,6 @@
 #include "HTMLViewSourceParser.h"
 
 #include "HTMLNames.h"
-#include "HTMLTreeBuilder.h"
 #include "HTMLViewSourceDocument.h"
 
 namespace WebCore {
@@ -87,13 +86,7 @@ void HTMLViewSourceParser::updateTokenizerState()
         return;
 
     AtomicString tagName(m_token.name().data(), m_token.name().size());
-    m_tokenizer->setState(HTMLTreeBuilder::adjustedLexerState(m_tokenizer->state(), tagName, document()->frame()));
-    if (tagName == HTMLNames::scriptTag) {
-        // The tree builder handles scriptTag separately from the other tokenizer
-        // state adjustments, so we need to handle it separately too.
-        ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState);
-        m_tokenizer->setState(HTMLTokenizer::ScriptDataState);
-    }
+    m_tokenizer->updateStateFor(tagName, document()->frame());
 }
 
 void HTMLViewSourceParser::finish()

-- 
WebKit Debian packaging