[SCM] WebKit Debian packaging branch, debian/experimental, updated. upstream/1.3.3-9427-gc2be6fc
commit-queue at webkit.org
commit-queue at webkit.org
Wed Dec 22 15:58:00 UTC 2010
The following commit has been merged in the debian/experimental branch:
commit 1e2622de3db68697adc4703d4bdcb79a3323816b
Author: commit-queue at webkit.org <commit-queue at webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Date: Wed Nov 17 09:42:41 2010 +0000
2010-11-17 Peter Varga <pvarga at inf.u-szeged.hu>
Reviewed by Gavin Barraclough.
Collect the beginning characters in a RegExp pattern for look-up
optimization
https://bugs.webkit.org/show_bug.cgi?id=45748
Extend the YARR's parser with an algorithm which collects the potential
beginning characters from a RegExp pattern for later look-up optimization.
* yarr/RegexCompiler.cpp:
(JSC::Yarr::BeginCharHelper::BeginCharHelper):
(JSC::Yarr::BeginCharHelper::addBeginChar):
(JSC::Yarr::BeginCharHelper::merge):
(JSC::Yarr::BeginCharHelper::addCharacter):
(JSC::Yarr::BeginCharHelper::linkHotTerms):
(JSC::Yarr::RegexPatternConstructor::RegexPatternConstructor):
(JSC::Yarr::RegexPatternConstructor::addBeginTerm):
(JSC::Yarr::RegexPatternConstructor::setupDisjunctionBeginTerms):
(JSC::Yarr::RegexPatternConstructor::setupAlternativeBeginTerms):
(JSC::Yarr::RegexPatternConstructor::setupBeginChars):
(JSC::Yarr::compileRegex):
* yarr/RegexPattern.h:
(JSC::Yarr::TermChain::TermChain):
(JSC::Yarr::BeginChar::BeginChar):
(JSC::Yarr::RegexPattern::RegexPattern):
(JSC::Yarr::RegexPattern::reset):
git-svn-id: http://svn.webkit.org/repository/webkit/trunk@72180 268f45cc-cd09-0410-ab3c-d52691b4dbfc
diff --git a/JavaScriptCore/ChangeLog b/JavaScriptCore/ChangeLog
index 6737862..512ae90 100644
--- a/JavaScriptCore/ChangeLog
+++ b/JavaScriptCore/ChangeLog
@@ -1,3 +1,32 @@
+2010-11-17 Peter Varga <pvarga at inf.u-szeged.hu>
+
+ Reviewed by Gavin Barraclough.
+
+ Collect the beginning characters in a RegExp pattern for look-up
+ optimization
+ https://bugs.webkit.org/show_bug.cgi?id=45748
+
+ Extend the YARR's parser with an algorithm which collects the potential
+ beginning characters from a RegExp pattern for later look-up optimization.
+
+ * yarr/RegexCompiler.cpp:
+ (JSC::Yarr::BeginCharHelper::BeginCharHelper):
+ (JSC::Yarr::BeginCharHelper::addBeginChar):
+ (JSC::Yarr::BeginCharHelper::merge):
+ (JSC::Yarr::BeginCharHelper::addCharacter):
+ (JSC::Yarr::BeginCharHelper::linkHotTerms):
+ (JSC::Yarr::RegexPatternConstructor::RegexPatternConstructor):
+ (JSC::Yarr::RegexPatternConstructor::addBeginTerm):
+ (JSC::Yarr::RegexPatternConstructor::setupDisjunctionBeginTerms):
+ (JSC::Yarr::RegexPatternConstructor::setupAlternativeBeginTerms):
+ (JSC::Yarr::RegexPatternConstructor::setupBeginChars):
+ (JSC::Yarr::compileRegex):
+ * yarr/RegexPattern.h:
+ (JSC::Yarr::TermChain::TermChain):
+ (JSC::Yarr::BeginChar::BeginChar):
+ (JSC::Yarr::RegexPattern::RegexPattern):
+ (JSC::Yarr::RegexPattern::reset):
+
2010-11-17 Sheriff Bot <webkit.review.bot at gmail.com>
Unreviewed, rolling out r72160.
diff --git a/JavaScriptCore/yarr/RegexCompiler.cpp b/JavaScriptCore/yarr/RegexCompiler.cpp
index 9f9e028..ccfc94e 100644
--- a/JavaScriptCore/yarr/RegexCompiler.cpp
+++ b/JavaScriptCore/yarr/RegexCompiler.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga at inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -235,11 +236,117 @@ private:
Vector<CharacterRange> m_rangesUnicode;
};
+struct BeginCharHelper {
+ BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
+ : m_beginChars(beginChars)
+ , m_isCaseInsensitive(isCaseInsensitive)
+ {}
+
+ void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
+ {
+ if (quantityType == QuantifierFixedCount && quantityCount > 1) {
+ // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
+ beginChar.value |= beginChar.value << 16;
+ beginChar.mask |= beginChar.mask << 16;
+ addCharacter(beginChar);
+ } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
+ // In case of characters with fixed quantifier we should check the next character as well.
+ linkHotTerms(beginChar, hotTerms);
+ else
+ // In case of greedy matching the next character checking is unnecessary therefore we just store
+ // the first character.
+ addCharacter(beginChar);
+ }
+
+ // Merge two following BeginChars in the vector to reduce the number of character checks.
+ void merge(unsigned size)
+ {
+ for (unsigned i = 0; i < size; i++) {
+ BeginChar* curr = &m_beginChars->at(i);
+ BeginChar* next = &m_beginChars->at(i + 1);
+
+ // If the current and the next size of value is different we should skip the merge process
+ // because the 16bit and 32bit values are unmergable.
+ if (curr->value <= 0xFFFF && next->value > 0xFFFF)
+ continue;
+
+ unsigned diff = curr->value ^ next->value;
+
+ curr->mask |= diff;
+ curr->value |= curr->mask;
+
+ m_beginChars->remove(i + 1);
+ size--;
+ }
+ }
+
+private:
+ void addCharacter(BeginChar beginChar)
+ {
+ unsigned pos = 0;
+ unsigned range = m_beginChars->size();
+
+ // binary chop, find position to insert char.
+ while (range) {
+ unsigned index = range >> 1;
+
+ int val = m_beginChars->at(pos+index).value - beginChar.value;
+ if (!val)
+ return;
+ if (val < 0)
+ range = index;
+ else {
+ pos += (index+1);
+ range -= (index+1);
+ }
+ }
+
+ if (pos == m_beginChars->size())
+ m_beginChars->append(beginChar);
+ else
+ m_beginChars->insert(pos, beginChar);
+ }
+
+ // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
+ void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
+ {
+ for (unsigned i = 0; i < hotTerms->size(); i++) {
+ PatternTerm hotTerm = hotTerms->at(i).term;
+ ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
+
+ UChar characterNext = hotTerm.patternCharacter;
+
+ // Append a character to an existing BeginChar object.
+ if (characterNext <= 0x7f) {
+ unsigned mask = 0;
+
+ if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
+ mask = 32;
+ characterNext = toASCIILower(characterNext);
+ }
+
+ addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
+ } else {
+ UChar upper, lower;
+ if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
+ addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
+ addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
+ } else
+ addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
+ }
+ }
+ }
+
+ Vector<BeginChar>* m_beginChars;
+ bool m_isCaseInsensitive;
+};
+
class RegexPatternConstructor {
public:
RegexPatternConstructor(RegexPattern& pattern)
: m_pattern(pattern)
, m_characterClassConstructor(pattern.m_ignoreCase)
+ , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
, m_invertParentheticalAssertion(false)
{
}
@@ -649,12 +756,153 @@ public:
loopDisjunction->m_alternatives.clear();
}
}
-
-
+
+ bool addBeginTerm(PatternTerm term, Vector<TermChain>* beginTerms, PatternAlternative* alternative, unsigned numTerms, unsigned termIndex, unsigned depth)
+ {
+ if (term.quantityType == QuantifierFixedCount) {
+ beginTerms->append(TermChain(term));
+ if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
+ setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1);
+ } else if (termIndex != numTerms - 1) {
+ beginTerms->append(TermChain(term));
+ return true;
+ }
+
+ return false;
+ }
+
+ // This function collects the terms which are potentially matching the first number of depth characters in the result.
+ // If this function returns false then it found at least one term which makes the beginning character
+ // look-up optimization inefficient.
+ bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
+ {
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+ if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
+ return false;
+ }
+
+ return true;
+ }
+
+ bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
+ {
+ bool checkNext = true;
+ unsigned numTerms = alternative->m_terms.size();
+
+ while (checkNext && termIndex < numTerms) {
+ PatternTerm term = alternative->m_terms[termIndex];
+ checkNext = false;
+
+ switch (term.type) {
+ case PatternTerm::TypeAssertionBOL:
+ case PatternTerm::TypeAssertionEOL:
+ case PatternTerm::TypeAssertionWordBoundary:
+ return false;
+
+ case PatternTerm::TypeBackReference:
+ case PatternTerm::TypeForwardReference:
+ return false;
+
+ case PatternTerm::TypePatternCharacter:
+ if (addBeginTerm(term, beginTerms, alternative, numTerms, termIndex, depth)) {
+ termIndex++;
+ checkNext = true;
+ }
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ return false;
+
+ case PatternTerm::TypeParentheticalAssertion:
+ if (term.invertOrCapture)
+ return false;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ if (term.quantityType != QuantifierFixedCount) {
+ if (termIndex == numTerms - 1)
+ break;
+
+ termIndex++;
+ checkNext = true;
+
+ }
+
+ if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
+ return false;
+
+ break;
+ }
+ }
+
+ return true;
+ }
+
+ void setupBeginChars()
+ {
+ Vector<TermChain> beginTerms;
+ bool containsFixedCharacter = false;
+
+ if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
+ && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
+ unsigned size = beginTerms.size();
+
+ // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
+ if (!size)
+ return;
+
+ m_pattern.m_containsBeginChars = true;
+
+ for (unsigned i = 0; i < size; i++) {
+ PatternTerm term = beginTerms[i].term;
+
+ // We have just collected PatternCharacter terms, other terms are not allowed.
+ ASSERT(term.type == PatternTerm::TypePatternCharacter);
+
+ if (term.quantityType == QuantifierFixedCount)
+ containsFixedCharacter = true;
+
+ UChar character = term.patternCharacter;
+ unsigned mask = 0;
+
+ if (character <= 0x7f) {
+ if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
+ mask = 32;
+ character = toASCIILower(character);
+ }
+
+ m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ } else {
+ UChar upper, lower;
+ if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
+ m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ } else
+ m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
+ }
+ }
+
+ // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
+ if (!containsFixedCharacter) {
+ m_pattern.m_containsBeginChars = false;
+ return;
+ }
+
+ size = m_pattern.m_beginChars.size();
+
+ if (size > 2)
+ m_beginCharHelper.merge(size - 1);
+ else if (size <= 1)
+ m_pattern.m_containsBeginChars = false;
+ }
+ }
+
private:
RegexPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
+ BeginCharHelper m_beginCharHelper;
bool m_invertCharacterClass;
bool m_invertParentheticalAssertion;
};
@@ -687,6 +935,7 @@ const char* compileRegex(const UString& patternString, RegexPattern& pattern)
constructor.optimizeBOL();
constructor.setupOffsets();
+ constructor.setupBeginChars();
return 0;
};
diff --git a/JavaScriptCore/yarr/RegexPattern.h b/JavaScriptCore/yarr/RegexPattern.h
index eecbd43..be31fcd 100644
--- a/JavaScriptCore/yarr/RegexPattern.h
+++ b/JavaScriptCore/yarr/RegexPattern.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga at inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -283,11 +284,36 @@ CharacterClass* nondigitsCreate();
CharacterClass* nonspacesCreate();
CharacterClass* nonwordcharCreate();
+struct TermChain {
+ TermChain(PatternTerm term)
+ : term(term)
+ {}
+
+ PatternTerm term;
+ Vector<TermChain> hotTerms;
+};
+
+struct BeginChar {
+ BeginChar()
+ : value(0)
+ , mask(0)
+ {}
+
+ BeginChar(unsigned value, unsigned mask)
+ : value(value)
+ , mask(mask)
+ {}
+
+ unsigned value;
+ unsigned mask;
+};
+
struct RegexPattern {
RegexPattern(bool ignoreCase, bool multiline)
: m_ignoreCase(ignoreCase)
, m_multiline(multiline)
, m_containsBackreferences(false)
+ , m_containsBeginChars(false)
, m_containsBOL(false)
, m_numSubpatterns(0)
, m_maxBackReference(0)
@@ -313,6 +339,7 @@ struct RegexPattern {
m_maxBackReference = 0;
m_containsBackreferences = false;
+ m_containsBeginChars = false;
m_containsBOL = false;
newlineCached = 0;
@@ -327,6 +354,7 @@ struct RegexPattern {
m_disjunctions.clear();
deleteAllValues(m_userCharacterClasses);
m_userCharacterClasses.clear();
+ m_beginChars.clear();
}
bool containsIllegalBackReference()
@@ -380,12 +408,14 @@ struct RegexPattern {
bool m_ignoreCase : 1;
bool m_multiline : 1;
bool m_containsBackreferences : 1;
+ bool m_containsBeginChars : 1;
bool m_containsBOL : 1;
unsigned m_numSubpatterns;
unsigned m_maxBackReference;
PatternDisjunction* m_body;
Vector<PatternDisjunction*, 4> m_disjunctions;
Vector<CharacterClass*> m_userCharacterClasses;
+ Vector<BeginChar> m_beginChars;
private:
CharacterClass* newlineCached;
--
WebKit Debian packaging
More information about the Pkg-webkit-commits
mailing list