[Pkg-mediawiki-commits] [wikidiff2] 01/05: New upstream version 1.5.1

Kunal Mehta legoktm-guest at moszumanska.debian.org
Tue Oct 31 05:36:10 UTC 2017


This is an automated email from the git hooks/post-receive script.

legoktm-guest pushed a commit to branch master
in repository wikidiff2.

commit 4bb5d6968b902a9984d09ae28be2ab2a144fe3b1
Author: Kunal Mehta <legoktm at member.fsf.org>
Date:   Mon Oct 30 22:14:10 2017 -0700

    New upstream version 1.5.1
---
 .gitignore          |  37 +++++++
 DiffEngine.h        | 104 ++++++++++++++++++
 DiffTest/test.php   |   2 +
 FuzzTest/fuzz.php   |  59 ++++++++++
 FuzzTest/random.php | 153 ++++++++++++++++++++++++++
 InlineDiff.cpp      |   8 +-
 InlineDiff.h        |   2 +-
 README              |  13 +++
 TableDiff.cpp       |  54 ++++++---
 TableDiff.h         |   2 +-
 Wikidiff2.cpp       | 311 ++++++++++++++++++++++++++++------------------------
 Wikidiff2.h         |  82 +++++++++-----
 config.m4           |   2 +-
 ext_wikidiff2.php   |   7 +-
 hhvm_wikidiff2.cpp  |  25 +++--
 php_wikidiff2.cpp   |  27 +++--
 php_wikidiff2.h     |   1 +
 tests/001.phpt      |  25 ++---
 tests/004.phpt      |   3 +-
 tests/007.phpt      | 114 +++++++++++++++++++
 tests/008.phpt      |  54 +++++++++
 textutil.h          | 180 ++++++++++++++++++++++++++++++
 22 files changed, 1039 insertions(+), 226 deletions(-)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d9257cd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,37 @@
+.svn
+*~
+*.kate-swp
+.*.swp
+.idea
+*.in
+*.lo
+*.la
+*.m4
+Makefile*
+config.*
+CMake*
+*.iml
+*.so
+
+install_manifest.txt
+.deps
+configure
+install-sh
+libtool
+ltmain.sh
+missing
+mkinstalldirs
+run-test
+run-tests.php
+
+.libs
+autom4te.cache
+build
+modules
+
+tests/*.diff
+tests/*.exp
+tests/*.log
+tests/*.out
+tests/*.php
+tests/*.sh
diff --git a/DiffEngine.h b/DiffEngine.h
index fc8fc5d..76d0c0b 100644
--- a/DiffEngine.h
+++ b/DiffEngine.h
@@ -13,12 +13,20 @@
 #include <utility>
 #include <algorithm>
 #include <cassert>
+#include <string>
+#include <numeric>
 
 #ifdef USE_JUDY
 #include "JudyHS.h"
 #endif
 
 #include "Wikidiff2.h"
+#include "Word.h"
+#include "textutil.h"
+
+// helper function to calculate similarity of text lines, based on existing diff code.
+// used in DiffEngine and Wikidiff2.
+double calculateSimilarity(TextUtil::WordVector& words1, TextUtil::WordVector& words2, long long bailoutComplexity, int *opCountPtr = nullptr);
 
 /**
  * Diff operation
@@ -138,6 +146,8 @@ class DiffEngine
 		int lcs;
 		bool done;
 		enum {MAX_CHUNKS=8};
+		void detectDissimilarChanges(PointerVector& del, PointerVector& add, Diff<T>& diff, long long bailoutComplexity);
+		bool looksLikeChange(const T& del, const T& add, long long bailoutComplexity);
 };
 
 //-----------------------------------------------------------------------------
@@ -157,6 +167,41 @@ void DiffEngine<T>::clear()
 	done = false;
 }
 
+// Returns true when a deleted/added line pair is similar enough at word level (similarity above 0.25) to be kept as a single change instead of a separate delete and add.
+template<typename T>
+inline bool DiffEngine<T>::looksLikeChange(const T& del, const T& add, long long bailoutComplexity)
+{
+	TextUtil::WordVector removedWords, insertedWords;
+	TextUtil::explodeWords(del, removedWords);
+	TextUtil::explodeWords(add, insertedWords);
+	return 0.25 < calculateSimilarity(removedWords, insertedWords, bailoutComplexity);
+}
+
+// Peels off the leading del/add line pairs that do not look like a change and emits each pair as a del op followed by an add op; stops at the first pair that does look like a change.
+template<typename T>
+inline void DiffEngine<T>::detectDissimilarChanges(PointerVector& del, PointerVector& add, Diff<T>& diff, long long bailoutComplexity)
+{
+	static PointerVector empty;
+	int consumed = 0;
+	for (; consumed<del.size() && consumed<add.size() && !looksLikeChange(*del[consumed], *add[consumed], bailoutComplexity); ++consumed) {
+		PointerVector removedLine, insertedLine;
+		removedLine.push_back(del[consumed]);
+		insertedLine.push_back(add[consumed]);
+		diff.add_edit(DiffOp<T>(DiffOp<T>::del, removedLine, empty));
+		diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, insertedLine));
+	}
+	if (consumed != 0) {	// drop the pairs already emitted so the caller only sees the remainder
+		add.erase(add.begin(), add.begin()+consumed);
+		del.erase(del.begin(), del.begin()+consumed);
+	}
+}
+
+template<>
+inline void DiffEngine<Word>::detectDissimilarChanges(PointerVector& del, PointerVector& add, Diff<Word>& diff, long long bailoutComplexity)
+{
+	// Intentionally empty: in the Word specialization no change is ever split into del+add, so del, add and diff are left untouched.
+}
+
 template<typename T>
 void DiffEngine<T>::diff (const ValueVector & from_lines,
 		const ValueVector & to_lines, Diff<T> & diff,
@@ -266,8 +311,18 @@ void DiffEngine<T>::diff (const ValueVector & from_lines,
 		while (yi < n_to && ychanged[yi])
 			add.push_back(&to_lines[yi++]);
 
+		detectDissimilarChanges(del, add, diff, bailoutComplexity);
+
 		if (del.size() && add.size())
+#ifdef DIFFENGINE__EVERY_CHANGE_IS_AN_ADD_AND_DELETE
+		// for generating a worst-case benchmark of the "show moved paragraphs" patch (gerrit change 319866)
+		{
+			diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
+			diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, add));
+		}
+#else
 			diff.add_edit(DiffOp<T>(DiffOp<T>::change, del, add));
+#endif
 		else if (del.size())
 			diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
 		else if (add.size())
@@ -601,4 +656,53 @@ Diff<T>::Diff(const ValueVector & from_lines, const ValueVector & to_lines,
 	engine.diff(from_lines, to_lines, *this, bailoutComplexity);
 }
 
+// Word-level similarity of two lines: characters in copied (unchanged) words divided by all characters counted across the ops of the word diff.
+// Returns 0.0 when nothing was copied. If opCountPtr is non-null it must point to at least 4 ints and receives the per-op character counts.
+inline double calculateSimilarity(TextUtil::WordVector& words1, TextUtil::WordVector& words2, long long bailoutComplexity, int *opCountPtr /* = nullptr*/)
+{
+	typedef Diff<Word> WordDiff;
+	enum { OP_KINDS = 4 };	// size of the per-op counter array; assumes the DiffOp op values are 0..3
+	WordDiff diff(words1, words2, bailoutComplexity);
+	int charsTotal = 0;
+	int opCharCount[OP_KINDS] = { 0 };
+	// summed length, from bodyStart to suffixEnd, of the words referenced by one side of an op
+	auto countOpChars = [] (DiffEngine<Word>::PointerVector& p) {
+		return std::accumulate(p.begin(), p.end(), 0, [] (int a, const Word *b) {
+			return a + (b->suffixEnd - b->bodyStart);
+		});
+	};
+	for (unsigned i = 0; i < diff.size(); ++i) {
+		const int op = diff[i].op;
+		int charCount = 0;
+		switch (op) {
+			case DiffOp<Word>::del:
+			case DiffOp<Word>::copy:
+				charCount = countOpChars(diff[i].from);
+				break;
+			case DiffOp<Word>::add:
+				charCount = countOpChars(diff[i].to);
+				break;
+			case DiffOp<Word>::change:
+				charCount = std::max(countOpChars(diff[i].from), countOpChars(diff[i].to));
+				break;
+			default:
+				continue;	// unexpected op value: skip it rather than read an unset count or index out of range
+		}
+		opCharCount[op] += charCount;
+		charsTotal += charCount;
+	}
+
+	// the charsTotal test only guards the division, mirroring the original nested check
+	const int copied = opCharCount[DiffOp<Word>::copy];
+	const double similarity = (copied && charsTotal) ? double(copied) / charsTotal : 0.0;
+
+	if (opCountPtr) {
+		for (int i = 0; i < OP_KINDS; ++i) {
+			opCountPtr[i] = opCharCount[i];
+		}
+	}
+
+	return similarity;
+}
+
 #endif
diff --git a/DiffTest/test.php b/DiffTest/test.php
index a405811..a77c42c 100644
--- a/DiffTest/test.php
+++ b/DiffTest/test.php
@@ -19,8 +19,10 @@ set_error_handler( function( $errno , $errstr ) {
 } );
 
 echo <<<HTML
+<!DOCTYPE html>
 <html>
 <title>Diff changes</title>
+<meta charset="UTF-8"/>
 <style>
 body {
 	font-family: sans-serif;
diff --git a/FuzzTest/fuzz.php b/FuzzTest/fuzz.php
new file mode 100644
index 0000000..504ab9f
--- /dev/null
+++ b/FuzzTest/fuzz.php
@@ -0,0 +1,59 @@
+<?php
+/**
+ * Endless fuzz test: feeds random input pairs to wikidiff2 and prints average timings.
+ */
+
+require 'random.php';
+
+if ( !function_exists( 'wikidiff2_inline_diff' ) ) {
+	die( "wikidiff2 not found, nothing to test\n" );
+}
+
+// Only pass the extra cutoff argument to wikidiff2_do_diff() when the extension reports version 0.3 or later.
+$wikidiff2Version = phpversion( 'wikidiff2' );
+$hasMovedParagraphDetection = $wikidiff2Version !== false &&
+	version_compare( $wikidiff2Version, '0.3', '>=' );
+if ( $hasMovedParagraphDetection ) {
+	echo "wikidiff2 version: $wikidiff2Version (with moved-paragraphs patch)\n";
+} else {
+	echo "wikidiff2 version: $wikidiff2Version (without moved-paragraphs patch)\n";
+}
+
+// Bail out early in case of any problems
+error_reporting( E_ALL | E_STRICT );
+/*set_error_handler( function( $errno , $errstr ) {
+	echo htmlspecialchars( $errstr );
+	die ( 1 );
+} );//*/
+
+echo "Performing an infinite fuzz test, press Ctrl+C to end...\n";
+
+$count = 0;
+$totalTime = 0;
+$chunkTime = 0;
+
+for ( ; ; ) {
+	list( $left, $right ) = Random::randomData();
+	$contextLines = mt_rand( 0, 10 );
+	$detectionCutoff = mt_rand( 0, 100 );
+
+	$started = microtime( true );
+	if ( $hasMovedParagraphDetection ) {
+		wikidiff2_do_diff( $left, $right, $contextLines, $detectionCutoff );
+	} else {
+		wikidiff2_do_diff( $left, $right, $contextLines );
+	}
+	wikidiff2_inline_diff( $left, $right, $contextLines );
+	$elapsed = microtime( true ) - $started;
+	$totalTime += $elapsed;
+	$chunkTime += $elapsed;
+
+	$count++;
+	if ( $count % 100 != 0 ) {
+		continue;
+	}
+	$perIteration = round( $totalTime / $count, 3 );
+	$perIterationInChunk = round( $chunkTime / 100, 3 );
+	$chunkTime = 0;
+	echo "  $count iterations, avg. iteration time $perIteration ($perIterationInChunk last 100 iterations)\n";
+}
diff --git a/FuzzTest/random.php b/FuzzTest/random.php
new file mode 100644
index 0000000..2468e21
--- /dev/null
+++ b/FuzzTest/random.php
@@ -0,0 +1,153 @@
+<?php
+
+class Random {
+	const MAX_CONTENT_LENGTH = 2000000;
+	const MAX_LINE_LENGTH = 75000;
+	const MAX_LINES = 50000; // not referenced within this class
+	const MAX_WORD_LENGTH = 25;
+
+	// Character repertoire per script; init() turns each string into an array of characters.
+	private static $tables = [
+		// Numbers
+		0 => '0123456789.-',
+		// Latin
+		1 => 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-',
+		// Russian
+		2 => 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя-',
+		// Thai
+		3 => 'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู',
+	];
+
+	// Punctuation used by randomSeparator(); populated by init().
+	private static $separators;
+
+	/** Returns [ left, right ]: a pair of inputs to diff, built from random binary data and/or random text according to a random mode 0-4. */
+	public static function randomData() {
+		self::init();
+		switch ( mt_rand( 0, 4 ) ) {
+			// Diff binary garbage with binary garbage
+			case 0:
+				$left = self::randomBinaryString( mt_rand( 0, self::MAX_CONTENT_LENGTH ) );
+				$right = self::randomBinaryString( mt_rand( 0, self::MAX_CONTENT_LENGTH ) );
+				break;
+			// Diff binary garbage with text
+			case 1:
+				$left = self::randomBinaryString( mt_rand( 0, self::MAX_CONTENT_LENGTH ) );
+				$right = self::randomText();
+				break;
+			case 2:
+				$left = self::randomText();
+				$right = self::randomBinaryString( mt_rand( 0, self::MAX_CONTENT_LENGTH ) );
+				break;
+			// Diff text against text
+			case 3:
+				$left = self::randomText();
+				$right = self::randomText();
+				break;
+			// Diff text against shuffled text
+			case 4:
+				$left = self::randomText();
+				$right = self::randomShuffledText( $left );
+				break;
+			default:
+				throw new Exception( 'This should not happen' );
+		}
+
+		return [ $left, $right ];
+	}
+
+	/** One-time setup: converts the separator string and every character table into arrays of single characters. */
+	private static function init() {
+		static $initd = false;
+		if ( $initd ) {
+			return;
+		}
+
+		self::$separators = self::split( '.,?!:;。.!?。\'' );
+
+		foreach ( self::$tables as $index => $table ) {
+			self::$tables[$index] = self::split( $table );
+		}
+		$initd = true;
+	}
+
+	/** Splits a UTF-8 string into an array of its characters (an empty string yields an empty array). */
+	private static function split( $str ) {
+		return preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY );
+	}
+
+	/** Appends random lines, each followed by 1-4 newlines, until the text has at least $length characters (random length if 0). */
+	private static function randomText( $length = 0 ) {
+		if ( !$length ) {
+			$length = mt_rand( 0, self::MAX_CONTENT_LENGTH );
+		}
+
+		$str = '';
+		do {
+			$str .= self::randomLine();
+			$str .= str_repeat( "\n", mt_rand( 1, 4 ) );
+		} while ( mb_strlen( $str ) < $length );
+		return $str;
+	}
+
+	/** Returns twice as many lines as $source has, each picked at random from $source and followed by 1-4 newlines. */
+	private static function randomShuffledText($source) {
+		$sourceLines = explode( "\n", $source );
+		$sourceLineCount = count( $sourceLines );
+		$outputLineCount = $sourceLineCount * 2;
+		$ret = "";
+		for ( $i = 0; $i < $outputLineCount; $i++ ) {
+			$ret .= $sourceLines[ mt_rand( 0, $sourceLineCount - 1 ) ];
+			$ret .= str_repeat( "\n", mt_rand( 1, 4 ) );
+		}
+		return $ret;
+	}
+
+	/** Appends random words, separators and spaces until the line has at least $length bytes (random length if 0), then trims it. */
+	private static function randomLine( $length = 0 ) {
+		if ( !$length ) {
+			$length = mt_rand( 1, self::MAX_LINE_LENGTH );
+		}
+		$line = '';
+		do {
+			$line .= self::randomWord();
+			$line .= self::randomSeparator( mt_rand( 0, 3 ) );
+			$line .= str_repeat( ' ', mt_rand( 1, 10 ) );
+		} while ( strlen( $line ) < $length );
+		return trim( $line );
+	}
+
+	/** Returns $length characters (random 1..MAX_WORD_LENGTH if 0) drawn from one randomly chosen character table. */
+	private static function randomWord( $length = 0 ) {
+		if ( !$length ) {
+			$length = mt_rand( 1, self::MAX_WORD_LENGTH );
+		}
+		$charset = self::$tables[mt_rand( 0, count( self::$tables ) - 1 )];
+		$str = '';
+		$chars = count( $charset );
+		for ( $i = 0; $i < $length; $i++ ) {
+			$str .= $charset[mt_rand( 0, $chars - 1 )];
+		}
+		return $str;
+	}
+
+	/** Returns $count randomly chosen separator characters, concatenated. */
+	private static function randomSeparator( $count = 1 ) {
+		$separatorCount = count( self::$separators );
+		$str = '';
+		for ( $i = 0; $i < $count; $i++ ) {
+			$str .= self::$separators[mt_rand( 0, $separatorCount - 1 )];
+		}
+
+		return $str;
+	}
+
+	/** Returns a string of $length random bytes (each 0-255). */
+	private static function randomBinaryString( $length ) {
+		$str = '';
+		for ( $i = 0; $i < $length; $i++ ) {
+			$str .= chr( mt_rand( 0, 255 ) );
+		}
+		return $str;
+	}
+}
\ No newline at end of file
diff --git a/InlineDiff.cpp b/InlineDiff.cpp
index d60215c..2969b77 100644
--- a/InlineDiff.cpp
+++ b/InlineDiff.cpp
@@ -10,15 +10,17 @@ void InlineDiff::printDelete(const String& line)
 	printWrappedLine("<div class=\"mw-diff-inline-deleted\"><del>", line, "</del></div>\n");
 }
 
-void InlineDiff::printWordDiff(const String& text1, const String& text2)
+void InlineDiff::printWordDiff(const String& text1, const String& text2, bool printLeft, bool printRight, const String & srcAnchor, const String & dstAnchor)
 {
 	WordVector words1, words2;
 
-	explodeWords(text1, words1);
-	explodeWords(text2, words2);
+	TextUtil::explodeWords(text1, words1);
+	TextUtil::explodeWords(text2, words2);
 	WordDiff worddiff(words1, words2, MAX_WORD_LEVEL_DIFF_COMPLEXITY);
 	String word;
 
+	// XXXX todo: omit left side & do strike-through according to printLeft/printRight
+
 	result += "<div class=\"mw-diff-inline-changed\">";
 	for (unsigned i = 0; i < worddiff.size(); ++i) {
 		DiffOp<Word> & op = worddiff[i];
diff --git a/InlineDiff.h b/InlineDiff.h
index 01304ca..4cff620 100644
--- a/InlineDiff.h
+++ b/InlineDiff.h
@@ -8,7 +8,7 @@ class InlineDiff: public Wikidiff2 {
 	protected:
 		void printAdd(const String& line);
 		void printDelete(const String& line);
-		void printWordDiff(const String& text1, const String& text2);
+		void printWordDiff(const String& text1, const String& text2, bool printLeft = true, bool printRight = true, const String & srcAnchor = "", const String & dstAnchor = "");
 		void printBlockHeader(int leftLine, int rightLine);
 		void printContext(const String& input);
 
diff --git a/README b/README
index 67067fe..7ac959b 100644
--- a/README
+++ b/README
@@ -15,12 +15,25 @@ These files are 2.3MB each, and give a worst-case performance test. Performance
 
 Wikidiff2 is a PHP extension.
 
+== Dependencies ==
 It requires the following library:
 
 * libthai, a Thai language support library
   http://linux.thai.net/plone/TLWG/libthai/
   On Debian-based systems, you need libthai0 and libthai-dev packages
 
+* To build wikidiff2 as a HHVM extension on Debian systems, you need the following packages:
+        hhvm-dev libtbb-dev libtbb2 libboost-all-dev libdouble-conversion-dev \
+        libdouble-conversion1 libgoogle-glog-dev libgoogle-glog-doc libgoogle-glog0 \
+        libjemalloc-dev libjemalloc1 libjemalloc1-dbg
+
+* To build wikidiff2 as a PHP extension, you need the following packages:
+** On Jessie and previous versions:
+        php5-dev pkg-config
+** On Stretch and later versions:
+        php-dev pkg-config
+
+
 == Compilation and installation with Zend PHP ==
 
 $ phpize
diff --git a/TableDiff.cpp b/TableDiff.cpp
index 5a4969f..964f1c8 100644
--- a/TableDiff.cpp
+++ b/TableDiff.cpp
@@ -23,27 +23,53 @@ void TableDiff::printDelete(const String & line)
 		"</tr>\n";
 }
 
-void TableDiff::printWordDiff(const String & text1, const String & text2)
+void TableDiff::printWordDiff(const String & text1, const String & text2, bool printLeft, bool printRight, const String & srcAnchor, const String & dstAnchor)
 {
 	WordVector words1, words2;
 
-	explodeWords(text1, words1);
-	explodeWords(text2, words2);
+	TextUtil::explodeWords(text1, words1);
+	TextUtil::explodeWords(text2, words2);
 	WordDiff worddiff(words1, words2, MAX_WORD_LEVEL_DIFF_COMPLEXITY);
 
 	//debugPrintWordDiff(worddiff);
 
-	// print twice; first for left side, then for right side
-	result += "<tr>\n"
-		"  <td class=\"diff-marker\">−</td>\n"
-		"  <td class=\"diff-deletedline\"><div>";
-	printWordDiffSide(worddiff, false);
-	result += "</div></td>\n"
-		"  <td class=\"diff-marker\">+</td>\n"
-		"  <td class=\"diff-addedline\"><div>";
-	printWordDiffSide(worddiff, true);
-	result += "</div></td>\n"
-		"</tr>\n";
+	result += "<tr>\n";
+
+	// print left side or blank placeholder.
+	if (printLeft) {
+		result += "  <td class=\"diff-marker\">";
+		if(dstAnchor != "")
+			result += "<a class=\"mw-diff-movedpara-left\" href=\"#" + dstAnchor + "\">&#x26AB;</a>";
+		else
+			result += "−";
+		result += "</td>\n";
+		result += "  <td class=\"diff-deletedline\"><div>";
+		if(srcAnchor != "")
+			result += "<a name=\"" + srcAnchor + "\"></a>";
+		printWordDiffSide(worddiff, false);
+		result += "</div></td>\n";
+	} else {
+		result += "  <td colspan=\"2\" class=\"diff-empty\"> </td>\n";
+	}
+
+	// print right side or blank placeholder.
+	if (printRight) {
+		result += "  <td class=\"diff-marker\">";
+		if(dstAnchor != "")
+			result += "<a class=\"mw-diff-movedpara-right\" href=\"#" + dstAnchor + "\">&#x26AB;</a>";
+		else
+			result += "+";
+		result += "</td>\n";
+		result += "  <td class=\"diff-addedline\"><div>";
+		if(srcAnchor != "")
+			result += "<a name=\"" + srcAnchor + "\"></a>";
+		printWordDiffSide(worddiff, true);
+		result += "</div></td>\n"
+			"</tr>\n";
+	} else {
+		result += "  <td colspan=\"2\" class=\"diff-empty\"> </td>\n"
+			"</tr>\n";
+	}
 }
 
 void TableDiff::printWordDiffSide(WordDiff &worddiff, bool added)
diff --git a/TableDiff.h b/TableDiff.h
index 0a3adc6..9ec560d 100644
--- a/TableDiff.h
+++ b/TableDiff.h
@@ -8,7 +8,7 @@ class TableDiff: public Wikidiff2 {
 	protected:
 		void printAdd(const String& line);
 		void printDelete(const String& line);
-		void printWordDiff(const String& text1, const String & text2);
+		void printWordDiff(const String& text1, const String & text2, bool printLeft = true, bool printRight = true, const String & srcAnchor = "", const String & dstAnchor = "");
 		void printTextWithDiv(const String& input);
 		void printBlockHeader(int leftLine, int rightLine);
 		void printContext(const String& input);
diff --git a/Wikidiff2.cpp b/Wikidiff2.cpp
index d579b61..8b87acc 100644
--- a/Wikidiff2.cpp
+++ b/Wikidiff2.cpp
@@ -7,14 +7,15 @@
 
 #include <stdio.h>
 #include <string.h>
+#include <stdarg.h>
+//#define DIFFENGINE__EVERY_CHANGE_IS_AN_ADD_AND_DELETE
 #include "Wikidiff2.h"
-#include <thai/thailib.h>
-#include <thai/thwchar.h>
-#include <thai/thbrk.h>
 
 
+//#define DEBUG_MOVED_LINES
+
 void Wikidiff2::diffLines(const StringVector & lines1, const StringVector & lines2,
-		int numContextLines)
+		int numContextLines, int maxMovedLines)
 {
 	// first do line-level diff
 	StringDiff linediff(lines1, lines2);
@@ -37,7 +38,9 @@ void Wikidiff2::diffLines(const StringVector & lines1, const StringVector & line
 				// inserted lines
 				n = linediff[i].to.size();
 				for (j=0; j<n; j++) {
-					printAdd(*linediff[i].to[j]);
+					if (!printMovedLineDiff(linediff, i, j, maxMovedLines)) {
+						printAdd(*linediff[i].to[j]);
+					}
 				}
 				to_index += n;
 				break;
@@ -45,7 +48,9 @@ void Wikidiff2::diffLines(const StringVector & lines1, const StringVector & line
 				// deleted lines
 				n = linediff[i].from.size();
 				for (j=0; j<n; j++) {
-					printDelete(*linediff[i].from[j]);
+					if (!printMovedLineDiff(linediff, i, j, maxMovedLines)) {
+						printDelete(*linediff[i].from[j]);
+					}
 				}
 				from_index += n;
 				break;
@@ -93,6 +98,158 @@ void Wikidiff2::diffLines(const StringVector & lines1, const StringVector & line
 	}
 }
 
+// Tries to print the added/deleted line (opIndex, opLine) as one half of a moved paragraph, i.e. as a word diff against its most similar line from an op of the opposite kind.
+// Returns true if that word diff was printed; on false the caller prints the line as a plain add or delete.
+bool Wikidiff2::printMovedLineDiff(StringDiff & linediff, int opIndex, int opLine, int maxMovedLines)
+{
+	// 64-bit lookup key: opIndex in the high 32 bits, opLine in the low 32 bits (cast to unsigned first so a negative value cannot sign-extend into the other half)
+	auto makeKey = [](int index, int line) { return uint64_t(uint32_t(index)) << 32 | uint32_t(line); };
+
+	auto makeAnchorName = [](int index, int line, bool lhs) {
+		char ch[2048];
+		snprintf(ch, sizeof(ch), "movedpara_%d_%d_%s", index, line, lhs? "lhs": "rhs");
+		return String(ch);
+	};
+
+#ifdef DEBUG_MOVED_LINES
+	auto debugPrintf = [this](const char *fmt, ...) {
+		char ch[2048];
+		va_list ap;
+		va_start(ap, fmt);
+		vsnprintf(ch, sizeof(ch), fmt, ap);
+		va_end(ap);
+
+		result += "<tr><td /><td class=\"diff-context\" colspan=3>";
+		result += ch;
+		result += "</td></tr>";
+	};
+#else
+	auto debugPrintf = [](...) { };
+#endif
+
+	if(!allowPrintMovedLineDiff(linediff, maxMovedLines)) {
+		debugPrintf("printMovedLineDiff: diff too large (maxMovedLines=%d), not detecting moved lines", maxMovedLines);
+		return false;
+	}
+
+	debugPrintf("printMovedLineDiff (...), %d, %d\n", opIndex, opLine);
+
+	const bool printLeft = linediff[opIndex].op == DiffOp<String>::del;	// a deleted line is shown on the left, an added line on the right
+	bool printRight = !printLeft;
+
+	// check whether this op actually refers to the diff map entry
+	auto cmpDiffMapEntries = [&](int otherIndex, int otherLine) -> bool {
+		uint64_t otherKey = makeKey(otherIndex, otherLine);
+		auto it = diffMap.find(otherKey);
+		if (it != diffMap.end()) {
+			auto other = it->second;
+			bool cmp = (printLeft ?
+				other->opIndexFrom == opIndex && other->opLineFrom == opLine :
+				other->opIndexTo == opIndex && other->opLineTo == opLine);
+			if(!cmp) {
+				debugPrintf("printMovedLineDiff(..., %d, %d): not printing diff again. op=%s", opIndex, opLine,
+					linediff[opIndex].op == DiffOp<String>::add ? "add": linediff[opIndex].op == DiffOp<String>::del ? "del": "???");
+				return false;
+			}
+		}
+		return true;
+	};
+
+	// look for corresponding moved line for the opposite case in moved-line-map
+	// if moved line exists:
+	//     print diff to the moved line, omitting the left/right side for added/deleted line
+	uint64_t key = makeKey(opIndex, opLine);
+	auto it = diffMap.find(key);
+	if (it != diffMap.end()) {
+		auto best = it->second;
+		int otherIndex = linediff[opIndex].op == DiffOp<String>::add ? best->opIndexFrom : best->opIndexTo;
+		int otherLine = linediff[opIndex].op == DiffOp<String>::add ? best->opLineFrom : best->opLineTo;
+
+		if(!cmpDiffMapEntries(otherIndex, otherLine))
+			return false;
+
+		// XXXX todo: we already have the diff, don't have to do it again, just have to print it
+		printWordDiff(*linediff[best->opIndexFrom].from[best->opLineFrom], *linediff[best->opIndexTo].to[best->opLineTo],
+			printLeft, printRight, makeAnchorName(opIndex, opLine, printLeft), makeAnchorName(otherIndex, otherLine, !printLeft));
+
+		if(printLeft)
+			best->lhsDisplayed = true;
+		else
+			best->rhsDisplayed = true;
+
+		debugPrintf("found in diffmap. copy: %d, del: %d, add: %d, change: %d, similarity: %.4f\n"
+					"from: (%d,%d) to: (%d,%d)\n",
+			best->opCharCount[DiffOp<Word>::copy], best->opCharCount[DiffOp<Word>::del], best->opCharCount[DiffOp<Word>::add], best->opCharCount[DiffOp<Word>::change], best->similarity,
+			best->opIndexFrom, best->opLineFrom, best->opIndexTo, best->opLineTo);
+
+		return true;
+	}
+
+	debugPrintf("nothing found in moved-line-map");
+
+	// else:
+	//     try to find a corresponding moved line in deleted/added lines
+	int otherOp = (linediff[opIndex].op == DiffOp<String>::add ? DiffOp<String>::del : DiffOp<String>::add);
+	std::shared_ptr<DiffMapEntry> found = nullptr;
+	for (int i = 0; i < linediff.size(); ++i) {
+		if (linediff[i].op == otherOp) {
+			auto& lines = (linediff[opIndex].op == DiffOp<String>::add ? linediff[i].from : linediff[i].to);
+			for (int k = 0; k < lines.size(); ++k) {
+				WordVector words1, words2;
+				std::shared_ptr<DiffMapEntry> tmp;
+				TextUtil::explodeWords(*lines[k], words1);
+				if (otherOp == DiffOp<String>::del) {
+					TextUtil::explodeWords(*linediff[opIndex].to[opLine], words2);
+					tmp = std::make_shared<DiffMapEntry>(words1, words2, i, k, opIndex, opLine);
+				} else {
+					TextUtil::explodeWords(*linediff[opIndex].from[opLine], words2);
+					tmp = std::make_shared<DiffMapEntry>(words1, words2, opIndex, opLine, i, k);
+				}
+				if (!found || tmp->similarity > found->similarity) {
+					found= tmp;
+				}
+			}
+		}
+	}
+
+	if(found)
+		debugPrintf("candidate found with similarity %.2f", found->similarity);
+
+	// if candidate exists:
+	//     add candidate to moved-line-map twice, for add/del case
+	//     print diff to the moved line, omitting the left/right side for added/deleted line
+	if (found && found->similarity > 0.4) {
+		// if we displayed a diff to the found block before, don't display this one as moved.
+		int otherIndex = linediff[opIndex].op == DiffOp<String>::add ? found->opIndexFrom : found->opIndexTo;
+		int otherLine = linediff[opIndex].op == DiffOp<String>::add ? found->opLineFrom : found->opLineTo;
+
+		if(!cmpDiffMapEntries(otherIndex, otherLine))
+			return false;
+
+		if(printLeft)
+			found->lhsDisplayed = true;
+		else
+			found->rhsDisplayed = true;
+
+		diffMap[key] = found;
+		diffMap[makeKey(otherIndex, otherLine)] = found;
+		debugPrintf("inserting (%d,%d) + (%d,%d)", opIndex, opLine, otherIndex, otherLine);
+
+		// XXXX todo: we already have the diff, don't have to do it again, just have to print it
+		printWordDiff(*linediff[found->opIndexFrom].from[found->opLineFrom], *linediff[found->opIndexTo].to[found->opLineTo],
+			printLeft, printRight, makeAnchorName(opIndex, opLine, printLeft), makeAnchorName(otherIndex, otherLine, !printLeft));
+
+		debugPrintf("copy: %d, del: %d, add: %d, change: %d, similarity: %.4f\n"
+					"from: (%d,%d) to: (%d,%d)\n",
+			found->opCharCount[DiffOp<Word>::copy], found->opCharCount[DiffOp<Word>::del], found->opCharCount[DiffOp<Word>::add], found->opCharCount[DiffOp<Word>::change], found->similarity,
+			found->opIndexFrom, found->opLineFrom, found->opIndexTo, found->opLineTo);
+
+		return true;
+	}
+
+	return false;
+}
+
 void Wikidiff2::debugPrintWordDiff(WordDiff & worddiff)
 {
 	for (unsigned i = 0; i < worddiff.size(); ++i) {
@@ -165,144 +322,6 @@ void Wikidiff2::printText(const String & input)
 	}
 }
 
-// Weak UTF-8 decoder
-// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
-int Wikidiff2::nextUtf8Char(String::const_iterator & p, String::const_iterator & charStart,
-		String::const_iterator end)
-{
-	int c = 0;
-	unsigned char byte;
-	int seqLength = 0;
-	charStart = p;
-	if (p == end) {
-		return 0;
-	}
-	do {
-		byte = (unsigned char)*p;
-		if (byte < 0x80) {
-			c = byte;
-			seqLength = 0;
-		} else if (byte >= 0xc0) {
-			// Start of UTF-8 character
-			// If this is unexpected, due to an overshort sequence, we ignore the invalid
-			// sequence and resynchronise here
-			if (byte < 0xe0) {
-				seqLength = 1;
-				c = byte & 0x1f;
-			} else if (byte < 0xf0) {
-				seqLength = 2;
-				c = byte & 0x0f;
-			} else {
-				seqLength = 3;
-				c = byte & 7;
-			}
-		} else if (seqLength) {
-			c <<= 6;
-			c |= byte & 0x3f;
-			--seqLength;
-		} else {
-			// Unexpected continuation, ignore
-		}
-		++p;
-	} while (seqLength && p != end);
-	return c;
-}
-
-// Split a string into words
-//
-// TODO: I think the best way to do this would be to use ICU BreakIterator
-// instead of libthai + DIY. Basically you'd run BreakIterators from several
-// different locales (en, th, ja) and merge the results, i.e. if a break occurs
-// in any locale at a given position, split the string. I don't know if the
-// quality of the Thai dictionary in ICU matches the one in libthai, we would
-// have to check this somehow.
-void Wikidiff2::explodeWords(const String & text, WordVector &words)
-{
-	// Decode the UTF-8 in the string.
-	// * Save the character sizes (in bytes)
-	// * Convert the string to TIS-620, which is the internal character set of libthai.
-	// * Save the character offsets of any break positions (same format as libthai).
-
-	String tisText, charSizes;
-	String::const_iterator suffixEnd, charStart, p;
-	IntSet breaks;
-
-	tisText.reserve(text.size());
-	charSizes.reserve(text.size());
-	wchar_t ch, lastChar;
-	thchar_t thaiChar;
-	bool hasThaiChars = false;
-
-	p = text.begin();
-	ch = nextUtf8Char(p, charStart, text.end());
-	lastChar = 0;
-	int charIndex = 0;
-	while (ch) {
-		thaiChar = th_uni2tis(ch);
-		if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
-			hasThaiChars = true;
-		}
-		tisText += (char)thaiChar;
-		charSizes += (char)(p - charStart);
-
-		if (isLetter(ch)) {
-			if (lastChar && !isLetter(lastChar)) {
-				breaks.insert(charIndex);
-			}
-		} else {
-			breaks.insert(charIndex);
-		}
-		charIndex++;
-		lastChar = ch;
-		ch = nextUtf8Char(p, charStart, text.end());
-	}
-
-	// If there were any Thai characters in the string, run th_brk on it and add
-	// the resulting break positions
-	if (hasThaiChars) {
-		IntVector thaiBreakPositions;
-		tisText += '\0';
-		thaiBreakPositions.resize(tisText.size());
-		int numBreaks = th_brk((const thchar_t*)(tisText.data()),
-				&thaiBreakPositions[0], thaiBreakPositions.size());
-		thaiBreakPositions.resize(numBreaks);
-		breaks.insert(thaiBreakPositions.begin(), thaiBreakPositions.end());
-	}
-
-	// Add a fake end-of-string character and have a break on it, so that the
-	// last word gets added without special handling
-	breaks.insert(charSizes.size());
-	charSizes += (char)0;
-
-	// Now make the word array by traversing the breaks set
-	p = text.begin();
-	IntSet::iterator pBrk = breaks.begin();
-	String::const_iterator wordStart = text.begin();
-	String::const_iterator suffixStart = text.end();
-
-	// If there's a break at the start of the string, skip it
-	if (pBrk != breaks.end() && *pBrk == 0) {
-		pBrk++;
-	}
-
-	for (charIndex = 0; charIndex < charSizes.size(); p += charSizes[charIndex++]) {
-		// Assume all spaces are ASCII
-		if (isSpace(*p)) {
-			suffixStart = p;
-		}
-		if (pBrk != breaks.end() && charIndex == *pBrk) {
-			if (suffixStart == text.end()) {
-				words.push_back(Word(wordStart, p, p));
-			} else {
-				words.push_back(Word(wordStart, suffixStart, p));
-			}
-			pBrk++;
-			suffixStart = text.end();
-			wordStart = p;
-		}
-	}
-}
-
 void Wikidiff2::explodeLines(const String & text, StringVector &lines)
 {
 	String::const_iterator ptr = text.begin();
@@ -317,7 +336,7 @@ void Wikidiff2::explodeLines(const String & text, StringVector &lines)
 	}
 }
 
-const Wikidiff2::String & Wikidiff2::execute(const String & text1, const String & text2, int numContextLines)
+const Wikidiff2::String & Wikidiff2::execute(const String & text1, const String & text2, int numContextLines, int maxMovedLines)
 {
 	// Allocate some result space to avoid excessive copying
 	result.clear();
@@ -330,7 +349,7 @@ const Wikidiff2::String & Wikidiff2::execute(const String & text1, const String
 	explodeLines(text2, lines2);
 
 	// Do the diff
-	diffLines(lines1, lines2, numContextLines);
+	diffLines(lines1, lines2, numContextLines, maxMovedLines);
 
 	// Return a reference to the result buffer
 	return result;
diff --git a/Wikidiff2.h b/Wikidiff2.h
index da75e7b..909e084 100644
--- a/Wikidiff2.h
+++ b/Wikidiff2.h
@@ -14,6 +14,9 @@
 #include <string>
 #include <vector>
 #include <set>
+#include <memory>
+
+#define WIKIDIFF2_VERSION_STRING	"1.5.1"
 
 class Wikidiff2 {
 	public:
@@ -26,7 +29,7 @@ class Wikidiff2 {
 		typedef Diff<String> StringDiff;
 		typedef Diff<Word> WordDiff;
 
-		const String & execute(const String & text1, const String & text2, int numContextLines);
+		const String & execute(const String & text1, const String & text2, int numContextLines, int maxMovedLines);
 
 		inline const String & getResult() const;
 
@@ -34,53 +37,74 @@ class Wikidiff2 {
 		enum { MAX_WORD_LEVEL_DIFF_COMPLEXITY = 40000000 };
 		String result;
 
+		struct DiffMapEntry
+		{
+			double similarity;
+			int opCharCount[4] = { 0 };
+			int opIndexFrom, opLineFrom, opIndexTo, opLineTo;
+			bool lhsDisplayed = false, rhsDisplayed = false;
+
+			DiffMapEntry(WordVector& words1, WordVector& words2, int opIndexFrom_, int opLineFrom_, int opIndexTo_, int opLineTo_);
+		};
+		// PhpAllocator can't be specialized for std::pair, so we're using the standard allocator.
+		typedef std::map<uint64_t, std::shared_ptr<struct Wikidiff2::DiffMapEntry> > DiffMap;
+		DiffMap diffMap;
+
+		class AllowPrintMovedLineDiff {
+			bool detectMovedLines = true;       // will be set to false when too many 'add' or 'delete' ops appear in diff.
+			bool detectMovedLinesValid = false; // whether detectMovedLines is valid.
+			public:
+				bool operator() (StringDiff & linediff, int maxMovedLines);	// calculates & caches comparison count
+		} allowPrintMovedLineDiff;
+
 		virtual void diffLines(const StringVector & lines1, const StringVector & lines2,
-				int numContextLines);
+				int numContextLines, int maxMovedLines);
 		virtual void printAdd(const String & line) = 0;
 		virtual void printDelete(const String & line) = 0;
-		virtual void printWordDiff(const String & text1, const String & text2) = 0;
+		virtual void printWordDiff(const String & text1, const String & text2, bool printLeft = true, bool printRight = true, const String & srcAnchor = "", const String & dstAnchor = "") = 0;
 		virtual void printBlockHeader(int leftLine, int rightLine) = 0;
 		virtual void printContext(const String & input) = 0;
 
 		void printText(const String & input);
-		inline bool isLetter(int ch);
-		inline bool isSpace(int ch);
 		void debugPrintWordDiff(WordDiff & worddiff);
 
-		int nextUtf8Char(String::const_iterator & p, String::const_iterator & charStart,
-				String::const_iterator end);
-
-		void explodeWords(const String & text, WordVector &tokens);
 		void explodeLines(const String & text, StringVector &lines);
+
+		bool printMovedLineDiff(StringDiff & linediff, int opIndex, int opLine, int maxMovedLines);
 };
 
-inline bool Wikidiff2::isLetter(int ch)
+inline const Wikidiff2::String & Wikidiff2::getResult() const
 {
-	// Standard alphanumeric
-	if ((ch >= '0' && ch <= '9') ||
-	   (ch == '_') ||
-	   (ch >= 'A' && ch <= 'Z') ||
-	   (ch >= 'a' && ch <= 'z'))
-	{
-		return true;
-	}
-	// Punctuation and control characters
-	if (ch < 0xc0) return false;
-	// Chinese, Japanese: split up character by character
-	if (ch >= 0x3000 && ch <= 0x9fff) return false;
-	if (ch >= 0x20000 && ch <= 0x2a000) return false;
-	// Otherwise assume it's from a language that uses spaces
-	return true;
+	return result;
 }
 
-inline bool Wikidiff2::isSpace(int ch)
+inline Wikidiff2::DiffMapEntry::DiffMapEntry(Wikidiff2::WordVector& words1, Wikidiff2::WordVector& words2, int opIndexFrom_, int opLineFrom_, int opIndexTo_, int opLineTo_):
+	opIndexFrom(opIndexFrom_), opLineFrom(opLineFrom_), opIndexTo(opIndexTo_), opLineTo(opLineTo_)
 {
-	return ch == ' ' || ch == '\t';
+	similarity = calculateSimilarity(words1, words2, MAX_WORD_LEVEL_DIFF_COMPLEXITY, opCharCount);
 }
 
-inline const Wikidiff2::String & Wikidiff2::getResult() const
+inline bool Wikidiff2::AllowPrintMovedLineDiff::operator () (StringDiff & linediff, int maxMovedLines)
 {
-	return result;
+	if(!detectMovedLinesValid) {
+		// count the number of added or removed lines which could have been moved.
+		int adds = 0, deletes = 0;
+		for(int i = 0; i < linediff.size(); ++i) {
+			if(linediff[i].op == DiffOp<String>::add)
+				++adds;
+			if(linediff[i].op == DiffOp<String>::del)
+				++deletes;
+			// the number of comparisons grows as (number of additions) x (number of deletions),
+			// so if adds + deletes exceeds maxMovedLines, don't try detecting moved lines.
+			if(adds+deletes > maxMovedLines) {
+				detectMovedLines = false;
+				break;
+			}
+		}
+		detectMovedLinesValid = true;
+	}
+	return detectMovedLines;
 }
 
+
 #endif
diff --git a/config.m4 b/config.m4
index b848398..c9e2197 100644
--- a/config.m4
+++ b/config.m4
@@ -36,6 +36,6 @@ if test "$PHP_WIKIDIFF2" != "no"; then
 
   PHP_SUBST(WIKIDIFF2_SHARED_LIBADD)
   AC_DEFINE(HAVE_WIKIDIFF2, 1, [ ])
-  export CXXFLAGS="-Wno-write-strings $CXXFLAGS"
+  export CXXFLAGS="-Wno-write-strings -std=c++11 $CXXFLAGS"
   PHP_NEW_EXTENSION(wikidiff2, php_wikidiff2.cpp Wikidiff2.cpp TableDiff.cpp InlineDiff.cpp, $ext_shared)
 fi
diff --git a/ext_wikidiff2.php b/ext_wikidiff2.php
index d69d94a..3816dec 100644
--- a/ext_wikidiff2.php
+++ b/ext_wikidiff2.php
@@ -1,6 +1,9 @@
 <?hh
 <<__Native>>
-function wikidiff2_do_diff(string $text1, string $text2, int $numContextLines): string;
+function wikidiff2_do_diff(string $text1, string $text2, int $numContextLines, int $maxMovedLines = 25): string;
 
 <<__Native>>
-function wikidiff2_inline_diff(string $text1, string $text2, int $numContextLines): string;
+function wikidiff2_inline_diff(string $text1, string $text2, int $numContextLines, int $maxMovedLines = 25): string;
+
+<<__Native>>
+function wikidiff2_version(): string;
diff --git a/hhvm_wikidiff2.cpp b/hhvm_wikidiff2.cpp
index 69bd9c8..957f950 100644
--- a/hhvm_wikidiff2.cpp
+++ b/hhvm_wikidiff2.cpp
@@ -12,7 +12,7 @@
 
 namespace HPHP {
 
-/* {{{ proto string wikidiff2_do_diff(string text1, string text2, int numContextLines)
+/* {{{ proto string wikidiff2_do_diff(string text1, string text2, int numContextLines, int maxMovedLines = 25)
  *
  * Warning: the input text must be valid UTF-8! Do not pass user input directly
  * to this function.
@@ -20,14 +20,15 @@ namespace HPHP {
 static String HHVM_FUNCTION(wikidiff2_do_diff,
 	const String& text1,
 	const String& text2,
-	int64_t numContextLines)
+	int64_t numContextLines,
+	int64_t maxMovedLines)
 {
     String result;
 	try {
 		TableDiff wikidiff2;
 		Wikidiff2::String text1String(text1.c_str());
 		Wikidiff2::String text2String(text2.c_str());
-		result = wikidiff2.execute(text1String, text2String, numContextLines);
+		result = wikidiff2.execute(text1String, text2String, numContextLines, maxMovedLines);
 	} catch (OutOfMemoryException &e) {
 		raise_error("Out of memory in wikidiff2_do_diff().");
 	} catch (...) {
@@ -36,7 +37,7 @@ static String HHVM_FUNCTION(wikidiff2_do_diff,
 	return result;
 }
 
-/* {{{ proto string wikidiff2_inline_diff(string text1, string text2, int numContextLines)
+/* {{{ proto string wikidiff2_inline_diff(string text1, string text2, int numContextLines, int maxMovedLines = 25)
  *
  * Warning: the input text must be valid UTF-8! Do not pass user input directly
  * to this function.
@@ -44,14 +45,15 @@ static String HHVM_FUNCTION(wikidiff2_do_diff,
 static String HHVM_FUNCTION(wikidiff2_inline_diff,
 	const String& text1,
 	const String& text2,
-	int64_t numContextLines)
+	int64_t numContextLines,
+	int64_t maxMovedLines)
 {
     String result;
 	try {
 		InlineDiff wikidiff2;
 		Wikidiff2::String text1String(text1.c_str());
 		Wikidiff2::String text2String(text2.c_str());
-		result = wikidiff2.execute(text1String, text2String, numContextLines);
+		result = wikidiff2.execute(text1String, text2String, numContextLines, 0 /*inlinediff todo*/);
 	} catch (OutOfMemoryException &e) {
 		raise_error("Out of memory in wikidiff2_do_diff().");
 	} catch (...) {
@@ -60,12 +62,21 @@ static String HHVM_FUNCTION(wikidiff2_inline_diff,
 	return result;
 }
 
+/* {{{ proto string wikidiff2_version()
+ */
+static String HHVM_FUNCTION(wikidiff2_version)
+{
+    String version = WIKIDIFF2_VERSION_STRING;
+	return version;
+}
+
 static class Wikidiff2Extension : public Extension {
 	public:
-		Wikidiff2Extension() : Extension("wikidiff2") {}
+		Wikidiff2Extension() : Extension("wikidiff2", WIKIDIFF2_VERSION_STRING) {}
 		virtual void moduleInit() {
 			HHVM_FE(wikidiff2_do_diff);
 			HHVM_FE(wikidiff2_inline_diff);
+			HHVM_FE(wikidiff2_version);
 			loadSystemlib();
 		}
 } s_wikidiff2_extension;
diff --git a/php_wikidiff2.cpp b/php_wikidiff2.cpp
index e15ad2b..52179ca 100644
--- a/php_wikidiff2.cpp
+++ b/php_wikidiff2.cpp
@@ -23,6 +23,7 @@ static int le_wikidiff2;
 zend_function_entry wikidiff2_functions[] = {
 	PHP_FE(wikidiff2_do_diff,     NULL)
 	PHP_FE(wikidiff2_inline_diff, NULL)
+	PHP_FE(wikidiff2_version, NULL)
 	{NULL, NULL, NULL}
 };
 
@@ -39,12 +40,11 @@ zend_module_entry wikidiff2_module_entry = {
 	PHP_RSHUTDOWN(wikidiff2),
 	PHP_MINFO(wikidiff2),
 #if ZEND_MODULE_API_NO >= 20010901
-	"0.2",
+	WIKIDIFF2_VERSION_STRING,
 #endif
 	STANDARD_MODULE_PROPERTIES
 };
 
-
 #ifdef COMPILE_DL_WIKIDIFF2
 ZEND_GET_MODULE(wikidiff2)
 #endif
@@ -77,7 +77,7 @@ PHP_MINFO_FUNCTION(wikidiff2)
 
 }
 
-/* {{{ proto string wikidiff2_do_diff(string text1, string text2, int numContextLines)
+/* {{{ proto string wikidiff2_do_diff(string text1, string text2, int numContextLines, int maxMovedLines = 25)
  *
  * Warning: the input text must be valid UTF-8! Do not pass user input directly
  * to this function.
@@ -91,14 +91,16 @@ PHP_FUNCTION(wikidiff2_do_diff)
 	size_t text1_len;
 	size_t text2_len;
 	zend_long numContextLines;
+	zend_long maxMovedLines = 25;
 #else
 	int text1_len;
 	int text2_len;
 	long numContextLines;
+	long maxMovedLines = 25;
 #endif
 
-	if (zend_parse_parameters(argc TSRMLS_CC, "ssl", &text1, &text1_len, &text2,
-		&text2_len, &numContextLines) == FAILURE)
+	if (zend_parse_parameters(argc TSRMLS_CC, "ssl|l", &text1, &text1_len, &text2,
+		&text2_len, &numContextLines, &maxMovedLines) == FAILURE)
 	{
 		return;
 	}
@@ -108,7 +110,7 @@ PHP_FUNCTION(wikidiff2_do_diff)
 		TableDiff wikidiff2;
 		Wikidiff2::String text1String(text1, text1_len);
 		Wikidiff2::String text2String(text2, text2_len);
-		const Wikidiff2::String & ret = wikidiff2.execute(text1String, text2String, (int)numContextLines);
+		const Wikidiff2::String & ret = wikidiff2.execute(text1String, text2String, (int)numContextLines, (int)maxMovedLines);
 		COMPAT_RETURN_STRINGL( const_cast<char*>(ret.data()), ret.size());
 	} catch (std::bad_alloc &e) {
 		zend_error(E_WARNING, "Out of memory in wikidiff2_do_diff().");
@@ -117,7 +119,7 @@ PHP_FUNCTION(wikidiff2_do_diff)
 	}
 }
 
-/* {{{ proto string wikidiff2_inline_diff(string text1, string text2, int numContextLines)
+/* {{{ proto string wikidiff2_inline_diff(string text1, string text2, int numContextLines, int maxMovedLines = 25)
  *
  * Warning: the input text must be valid UTF-8! Do not pass user input directly
  * to this function.
@@ -131,10 +133,12 @@ PHP_FUNCTION(wikidiff2_inline_diff)
 	size_t text1_len;
 	size_t text2_len;
 	zend_long numContextLines;
+	zend_long maxMovedLines = 25;
 #else
 	int text1_len;
 	int text2_len;
 	long numContextLines;
+	long maxMovedLines = 25;
 #endif
 
 	if (zend_parse_parameters(argc TSRMLS_CC, "ssl", &text1, &text1_len, &text2,
@@ -148,7 +152,7 @@ PHP_FUNCTION(wikidiff2_inline_diff)
 		InlineDiff wikidiff2;
 		Wikidiff2::String text1String(text1, text1_len);
 		Wikidiff2::String text2String(text2, text2_len);
-		const Wikidiff2::String& ret = wikidiff2.execute(text1String, text2String, (int)numContextLines);
+		const Wikidiff2::String& ret = wikidiff2.execute(text1String, text2String, (int)numContextLines, 0 /*inlinediff todo*/);
 		COMPAT_RETURN_STRINGL( const_cast<char*>(ret.data()), ret.size());
 	} catch (std::bad_alloc &e) {
 		zend_error(E_WARNING, "Out of memory in wikidiff2_inline_diff().");
@@ -157,6 +161,13 @@ PHP_FUNCTION(wikidiff2_inline_diff)
 	}
 }
 
+/* {{{ proto string wikidiff2_version()
+ */
+PHP_FUNCTION(wikidiff2_version)
+{
+	COMPAT_RETURN_STRINGL( const_cast<char*>(WIKIDIFF2_VERSION_STRING), strlen(WIKIDIFF2_VERSION_STRING));
+}
+
 /* }}} */
 
 
diff --git a/php_wikidiff2.h b/php_wikidiff2.h
index 973233f..878803d 100644
--- a/php_wikidiff2.h
+++ b/php_wikidiff2.h
@@ -44,6 +44,7 @@ PHP_MINFO_FUNCTION(wikidiff2);
 
 PHP_FUNCTION(wikidiff2_do_diff);
 PHP_FUNCTION(wikidiff2_inline_diff);
+PHP_FUNCTION(wikidiff2_version);
 
 
 
diff --git a/tests/001.phpt b/tests/001.phpt
index 36195b7..716d470 100644
--- a/tests/001.phpt
+++ b/tests/001.phpt
@@ -171,8 +171,8 @@ print wikidiff2_do_diff( $x, $y, 2 );
   <td class="diff-context"><div>a</div></td>
 </tr>
 <tr>
-  <td class="diff-marker">−</td>
-  <td class="diff-deletedline"><div>---line---</div></td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-left" href="#movedpara_7_0_rhs">&#x26AB;</a></td>
+  <td class="diff-deletedline"><div><a name="movedpara_5_0_lhs"></a>---line---</div></td>
   <td colspan="2" class="diff-empty"> </td>
 </tr>
 <tr>
@@ -205,8 +205,8 @@ print wikidiff2_do_diff( $x, $y, 2 );
 </tr>
 <tr>
   <td colspan="2" class="diff-empty"> </td>
-  <td class="diff-marker">+</td>
-  <td class="diff-addedline"><div>---line---</div></td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-right" href="#movedpara_5_0_lhs">&#x26AB;</a></td>
+  <td class="diff-addedline"><div><a name="movedpara_7_0_rhs"></a>---line---</div></td>
 </tr>
 <tr>
   <td class="diff-marker"> </td>
@@ -237,13 +237,13 @@ print wikidiff2_do_diff( $x, $y, 2 );
   <td class="diff-context"><div>a</div></td>
 </tr>
 <tr>
-  <td class="diff-marker">−</td>
-  <td class="diff-deletedline"><div>--line1--</div></td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-left" href="#movedpara_11_0_rhs">&#x26AB;</a></td>
+  <td class="diff-deletedline"><div><a name="movedpara_9_0_lhs"></a>--line1--</div></td>
   <td colspan="2" class="diff-empty"> </td>
 </tr>
 <tr>
-  <td class="diff-marker">−</td>
-  <td class="diff-deletedline"><div>--line2--</div></td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-left" href="#movedpara_11_1_rhs">&#x26AB;</a></td>
+  <td class="diff-deletedline"><div><a name="movedpara_9_1_lhs"></a>--line2--</div></td>
   <td colspan="2" class="diff-empty"> </td>
 </tr>
 <tr>
@@ -276,13 +276,13 @@ print wikidiff2_do_diff( $x, $y, 2 );
 </tr>
 <tr>
   <td colspan="2" class="diff-empty"> </td>
-  <td class="diff-marker">+</td>
-  <td class="diff-addedline"><div>--line1--</div></td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-right" href="#movedpara_9_0_lhs">&#x26AB;</a></td>
+  <td class="diff-addedline"><div><a name="movedpara_11_0_rhs"></a>--line1--</div></td>
 </tr>
 <tr>
   <td colspan="2" class="diff-empty"> </td>
-  <td class="diff-marker">+</td>
-  <td class="diff-addedline"><div>--line2--</div></td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-right" href="#movedpara_9_1_lhs">&#x26AB;</a></td>
+  <td class="diff-addedline"><div><a name="movedpara_11_1_rhs"></a>--line2--</div></td>
 </tr>
 <tr>
   <td class="diff-marker"> </td>
@@ -396,4 +396,3 @@ print wikidiff2_do_diff( $x, $y, 2 );
   <td class="diff-marker"> </td>
   <td class="diff-context"></td>
 </tr>
-
diff --git a/tests/004.phpt b/tests/004.phpt
index 32594d4..3dc0fa5 100644
--- a/tests/004.phpt
+++ b/tests/004.phpt
@@ -32,7 +32,8 @@ print wikidiff2_inline_diff( $x, $y, 2 );
 <div class="mw-diff-inline-changed">foo <del>bar</del><ins>test</ins></div>
 <div class="mw-diff-inline-deleted"><del> </del></div>
 <div class="mw-diff-inline-context">baz</div>
-<div class="mw-diff-inline-changed"><del>quux</del><ins>test</ins></div>
+<div class="mw-diff-inline-deleted"><del>quux</del></div>
+<div class="mw-diff-inline-added"><ins>test</ins></div>
 <div class="mw-diff-inline-added"><ins> </ins></div>
 <div class="mw-diff-inline-context">bang</div>
 
diff --git a/tests/007.phpt b/tests/007.phpt
new file mode 100644
index 0000000..f37fe16
--- /dev/null
+++ b/tests/007.phpt
@@ -0,0 +1,114 @@
+--TEST--
+Diff test G: moved paragraphs
+--SKIPIF--
+<?php if (!extension_loaded("wikidiff2")) print "skip"; ?>
+--FILE--
+<?php
+$x = <<<EOT
+Substance, in the truest and primary and most definite sense of the word, is that which is neither predicable of a subject nor present in a subject; for instance, the individual man or horse. But in a secondary sense those things are called substances within which, as species, the primary substances are included; also those which, as genera, include the species. For instance, the individual man is included in the species 'man', and the genus to which the species belongs is 'animal'; thes [...]
+
+It is plain from what has been said that both the name and the definition of the predicate must be predicable of the subject. For instance, 'man' is predicated of the individual man. Now in this case the name of the species 'man' is applied to the individual, for we use the term 'man' in describing the individual; and the definition of 'man' will also be predicated of the individual man, for the individual man is both man and animal. Thus, both the name and the definition of the species  [...]
+
+With regard, on the other hand, to those things which are present in a subject, it is generally the case that neither their name nor their definition is predicable of that in which they are present. Though, however, the definition is never predicable, there is nothing in certain cases to prevent the name being used. For instance, 'white' being present in a body is predicated of that in which it is present, for a body is called white: the definition, however, of the colour 'white' is neve [...]
+
+Everything except primary substances is either predicable of a primary substance or present in a primary substance. This becomes evident by reference to particular instances which occur. 'Animal' is predicated of the species 'man', therefore of the individual man, for if there were no individual man of whom it could be predicated, it could not be predicated of the species 'man' at all. Again, colour is present in body, therefore in individual bodies, for if there were no individual body  [...]
+
+Lawns are very important. Never underestimate lawns. Never underestimate the power of hot rollers for your hair and eyelash curlers for your eyelashes. You can never underestimate the stupidity of the general public.
+
+EOT;
+
+#---------------------------------------------------
+
+$y = <<<EOT
+It is plain from what has been said that both the name and the definition of the predicate must be predicable of the subject. For instance, 'man' is predicated of the individual man. Now in this case the name of the species 'man' is applied to the individual, for we use the term 'man' in describing the individual; and the definition of 'man' will also be predicated of the individual man, for the individual man is both man and animal. Thus, both the name and the definition of the species  [...]
+
+Everything except primary substances is either predicable of a primary substance or present in a primary substance. This becomes evident by reference to particular instances which occur. 'Animal' is predicated of the species 'man', therefore of the individual man, for if there were no individual man of whom it could be predicated, it could not be predicated of the species 'man' at all. Again, colour is present in body, therefore in individual bodies, for if there were no individual body  [...]
+
+With regard, on the other hand, to those things which are present in a subject, it is generally the case that neither their name nor their definition is predicable of that in which they are present. Though, however, the definition is never predicable, there is nothing in certain cases to prevent the name being used. For instance, 'white' being present in a body is predicated of that in which it is present, for a body is called white: the definition, however, of the colour 'white' is neve [...]
+
+Lawns are very important. Never underestimate lawns. Never underestimate the power of hot rollers for your hair and eyelash curlers for your eyelashes. You can never underestimate the stupidity of the general public.
+
+Substance, in the truest and most definite sense of the word, is that which is neither predicable of a subject nor present in a subject; for instance, the individual man or horse. But in a secondary sense those things are called substances within which, as species, the primary substances are included; also those which, as genera, include the species. For instance, the individual man is included in the species 'man', and the genus to which the species belongs is 'animal'; these, therefore [...]
+
+EOT;
+
+#---------------------------------------------------
+
+print wikidiff2_do_diff( $x, $y, 2 );
+
+?>
+--EXPECT--
+<tr>
+  <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+  <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+</tr>
+<tr>
+  <td class="diff-marker"><a class="mw-diff-movedpara-left" href="#movedpara_6_1_rhs">&#x26AB;</a></td>
+  <td class="diff-deletedline"><div><a name="movedpara_0_0_lhs"></a>Substance, in the truest<del class="diffchange diffchange-inline"> and primary</del> and most definite sense of the word, is that which is neither predicable of a subject nor present in a subject; for instance, the individual man or horse. But in a secondary sense those things are called substances within which, as species, the primary substances are included; also those which, as genera, include the species. For instanc [...]
+  <td colspan="2" class="diff-empty"> </td>
+</tr>
+<tr>
+  <td class="diff-marker">−</td>
+  <td class="diff-deletedline"></td>
+  <td colspan="2" class="diff-empty"> </td>
+</tr>
+<tr>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"><div>It is plain from what has been said that both the name and the definition of the predicate must be predicable of the subject. For instance, 'man' is predicated of the individual man. Now in this case the name of the species 'man' is applied to the individual, for we use the term 'man' in describing the individual; and the definition of 'man' will also be predicated of the individual man, for the individual man is both man and animal. Thus, both the name an [...]
+  <td class="diff-marker"> </td>
+  <td class="diff-context"><div>It is plain from what has been said that both the name and the definition of the predicate must be predicable of the subject. For instance, 'man' is predicated of the individual man. Now in this case the name of the species 'man' is applied to the individual, for we use the term 'man' in describing the individual; and the definition of 'man' will also be predicated of the individual man, for the individual man is both man and animal. Thus, both the name an [...]
+</tr>
+<tr>
+  <td colspan="2" class="diff-empty"> </td>
+  <td class="diff-marker">+</td>
+  <td class="diff-addedline"></td>
+</tr>
+<tr>
+  <td colspan="2" class="diff-empty"> </td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-right" href="#movedpara_4_1_lhs">&#x26AB;</a></td>
+  <td class="diff-addedline"><div><a name="movedpara_2_1_rhs"></a>Everything except primary substances is either predicable of a primary substance or present in a primary substance. This becomes evident by reference to particular instances which occur. 'Animal' is predicated of the species 'man', therefore of the individual man, for if there were no individual man of whom it could be predicated, it could not be predicated of the species 'man' at all. Again, colour is present in body, the [...]
+</tr>
+<tr>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"></td>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"></td>
+</tr>
+<tr>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"><div>With regard, on the other hand, to those things which are present in a subject, it is generally the case that neither their name nor their definition is predicable of that in which they are present. Though, however, the definition is never predicable, there is nothing in certain cases to prevent the name being used. For instance, 'white' being present in a body is predicated of that in which it is present, for a body is called white: the definition, howeve [...]
+  <td class="diff-marker"> </td>
+  <td class="diff-context"><div>With regard, on the other hand, to those things which are present in a subject, it is generally the case that neither their name nor their definition is predicable of that in which they are present. Though, however, the definition is never predicable, there is nothing in certain cases to prevent the name being used. For instance, 'white' being present in a body is predicated of that in which it is present, for a body is called white: the definition, howeve [...]
+</tr>
+<tr>
+  <td class="diff-marker">−</td>
+  <td class="diff-deletedline"></td>
+  <td colspan="2" class="diff-empty"> </td>
+</tr>
+<tr>
+  <td class="diff-marker"><a class="mw-diff-movedpara-left" href="#movedpara_2_1_rhs">&#x26AB;</a></td>
+  <td class="diff-deletedline"><div><a name="movedpara_4_1_lhs"></a>Everything except primary substances is either predicable of a primary substance or present in a primary substance. This becomes evident by reference to particular instances which occur. 'Animal' is predicated of the species 'man', therefore of the individual man, for if there were no individual man of whom it could be predicated, it could not be predicated of the species 'man' at all. Again, colour is present in body, t [...]
+  <td colspan="2" class="diff-empty"> </td>
+</tr>
+<tr>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"></td>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"></td>
+</tr>
+<tr>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"><div>Lawns are very important. Never underestimate lawns. Never underestimate the power of hot rollers for your hair and eyelash curlers for your eyelashes. You can never underestimate the stupidity of the general public.</div></td>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"><div>Lawns are very important. Never underestimate lawns. Never underestimate the power of hot rollers for your hair and eyelash curlers for your eyelashes. You can never underestimate the stupidity of the general public.</div></td>
+</tr>
+<tr>
+  <td colspan="2" class="diff-empty"> </td>
+  <td class="diff-marker">+</td>
+  <td class="diff-addedline"></td>
+</tr>
+<tr>
+  <td colspan="2" class="diff-empty"> </td>
+  <td class="diff-marker"><a class="mw-diff-movedpara-right" href="#movedpara_0_0_lhs">&#x26AB;</a></td>
+  <td class="diff-addedline"><div><a name="movedpara_6_1_rhs"></a>Substance, in the truest and most definite sense of the word, is that which is neither predicable of a subject nor present in a subject; for instance, the individual man or horse. But in a secondary sense those things are called substances within which, as species, the primary substances are included; also those which, as genera, include the species. For instance, the individual man is included in the species 'man', and th [...]
+</tr>
\ No newline at end of file
diff --git a/tests/008.phpt b/tests/008.phpt
new file mode 100644
index 0000000..f1322c9
--- /dev/null
+++ b/tests/008.phpt
@@ -0,0 +1,54 @@
+--TEST--
+Test detection of dissimilar paragraphs
+--SKIPIF--
+<?php if (!extension_loaded("wikidiff2")) print "skip"; ?>
+--FILE--
+<?php
+$x = <<<EOT
+AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA
+
+AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA
+
+EOT;
+
+#---------------------------------------------------
+
+$y = <<<EOT
+AAAAA AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB
+
+AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB
+
+EOT;
+
+#---------------------------------------------------
+
+print wikidiff2_do_diff( $x, $y, 2 );
+
+?>
+--EXPECT--
+<tr>
+  <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+  <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+</tr>
+<tr>
+  <td class="diff-marker">−</td>
+  <td class="diff-deletedline"><div>AAAAA AAAAA <del class="diffchange diffchange-inline">AAAAA</del> <del class="diffchange diffchange-inline">AAAAA</del> <del class="diffchange diffchange-inline">AAAAA</del> <del class="diffchange diffchange-inline">AAAAA</del> <del class="diffchange diffchange-inline">AAAAA</del> <del class="diffchange diffchange-inline">AAAAA</del> <del class="diffchange diffchange-inline">AAAAA</del> <del class="diffchange diffchange-inline">AAAAA</del></div></td>
+  <td class="diff-marker">+</td>
+  <td class="diff-addedline"><div>AAAAA AAAAA <ins class="diffchange diffchange-inline">BBBBB</ins> <ins class="diffchange diffchange-inline">BBBBB</ins> <ins class="diffchange diffchange-inline">BBBBB</ins> <ins class="diffchange diffchange-inline">BBBBB</ins> <ins class="diffchange diffchange-inline">BBBBB</ins> <ins class="diffchange diffchange-inline">BBBBB</ins> <ins class="diffchange diffchange-inline">BBBBB</ins> <ins class="diffchange diffchange-inline">BBBBB</ins></div></td>
+</tr>
+<tr>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"></td>
+  <td class="diff-marker"> </td>
+  <td class="diff-context"></td>
+</tr>
+<tr>
+  <td class="diff-marker">−</td>
+  <td class="diff-deletedline"><div>AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA</div></td>
+  <td colspan="2" class="diff-empty"> </td>
+</tr>
+<tr>
+  <td colspan="2" class="diff-empty"> </td>
+  <td class="diff-marker">+</td>
+  <td class="diff-addedline"><div>AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB</div></td>
+</tr>
diff --git a/textutil.h b/textutil.h
new file mode 100644
index 0000000..6031380
--- /dev/null
+++ b/textutil.h
@@ -0,0 +1,180 @@
+#ifndef TEXTUTIL_H
+#define TEXTUTIL_H
+
+#include <thai/thailib.h>
+#include <thai/thwchar.h>
+#include <thai/thbrk.h>
+
+namespace TextUtil
+{
+	typedef std::basic_string<char, std::char_traits<char>, WD2_ALLOCATOR<char> > String; // byte string, e.g. the UTF-8 input text
+	typedef std::vector<Word, WD2_ALLOCATOR<Word> > WordVector; // list of Words, as filled by explodeWords
+	typedef std::set<int, std::less<int>, WD2_ALLOCATOR<int> > IntSet; // ordered ints, e.g. break positions
+	typedef std::vector<int, WD2_ALLOCATOR<int> > IntVector; // int array, e.g. the th_brk output buffer
+
+	// helper functions used in both DiffEngine and Wikidiff2
+
+	inline bool isLetter(int ch)
+	{
+		// Plain ASCII alphanumerics and underscore are always word characters
+		const bool asciiDigit = ch >= '0' && ch <= '9';
+		const bool asciiUpper = ch >= 'A' && ch <= 'Z';
+		const bool asciiLower = ch >= 'a' && ch <= 'z';
+		if (asciiDigit || asciiUpper || asciiLower || ch == '_') {
+			return true;
+		}
+		// Remaining low code points: punctuation and control characters
+		if (ch < 0xc0) {
+			return false;
+		}
+		// Chinese, Japanese: split character by character, so never a letter.
+		// Anything else is assumed to belong to a language that uses spaces.
+		const bool cjk = (ch >= 0x3000 && ch <= 0x9fff) || (ch >= 0x20000 && ch <= 0x2a000);
+		return !cjk;
+	}
+
+	inline bool isSpace(int ch)
+	{
+		return ch == '\t' || ch == ' '; // only ASCII tab and space count
+	}
+
+	// Lenient UTF-8 decoder: decodes the character starting at p and returns its
+	// code point; returns 0 immediately when p == end. On return charStart holds
+	// the old value of p and p has moved past the bytes consumed. Input is not
+	// validated, so malformed sequences (overshort, overlong, etc.) give garbage.
+	inline int nextUtf8Char(String::const_iterator & p, String::const_iterator & charStart,
+			String::const_iterator end)
+	{
+		charStart = p;
+		if (p == end) {
+			return 0;
+		}
+		int codePoint = 0;
+		int pending = 0; // continuation bytes still expected
+		do {
+			const unsigned char octet = (unsigned char)*p;
+			++p;
+			if (octet < 0x80) {
+				// ASCII byte: it is the whole character
+				codePoint = octet;
+				pending = 0;
+			} else if (octet >= 0xf0) {
+				// Lead byte announcing three continuation bytes
+				codePoint = octet & 7;
+				pending = 3;
+			} else if (octet >= 0xe0) {
+				// Lead byte announcing two continuation bytes
+				codePoint = octet & 0x0f;
+				pending = 2;
+			} else if (octet >= 0xc0) {
+				// Lead byte announcing one continuation byte
+				codePoint = octet & 0x1f;
+				pending = 1;
+			} else if (pending) {
+				// Expected continuation byte: append its six payload bits
+				codePoint = (codePoint << 6) | (octet & 0x3f);
+				--pending;
+			}
+			// A continuation byte nobody asked for falls through and is ignored.
+			// A lead byte in mid-sequence simply restarts decoding from that byte.
+		} while (pending && p != end);
+		return codePoint;
+	}
+
+	// Split a UTF-8 string into words, appending one Word per segment to `words`.
+	// A break is placed before every non-letter character, before a letter that
+	// follows a non-letter (see isLetter), and wherever libthai's th_brk reports
+	// one. Every Word is built from three iterators into `text`: the segment
+	// start, the last space or tab position recorded since the previous break (or
+	// the segment end if there is none), and the segment end.
+	//
+	// TODO: I think the best way to do this would be to use ICU BreakIterator
+	// instead of libthai + DIY. Basically you'd run BreakIterators from several
+	// different locales (en, th, ja) and merge the results, i.e. if a break occurs
+	// in any locale at a given position, split the string. I don't know if the
+	// quality of the Thai dictionary in ICU matches the one in libthai, we would
+	// have to check this somehow.
+	inline void explodeWords(const String & text, WordVector &words)
+	{
+		// Decode the UTF-8 in the string.
+		// * Save the character sizes (in bytes)
+		// * Convert the string to TIS-620, which is the internal character set of libthai.
+		// * Save the character offsets of any break positions (same format as libthai).
+		String tisText;
+		IntVector charSizes;
+		IntSet breaks;
+		String::const_iterator charStart, p;
+		tisText.reserve(text.size());
+		charSizes.reserve(text.size() + 1);
+		wchar_t ch, lastChar;
+		thchar_t thaiChar;
+		bool hasThaiChars = false;
+
+		p = text.begin();
+		ch = nextUtf8Char(p, charStart, text.end());
+		lastChar = 0;
+		int charIndex = 0;
+		// NOTE(review): a character decoded as 0 ends this scan; later text is dropped - confirm intended
+		while (ch) {
+			thaiChar = th_uni2tis(ch);
+			if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
+				hasThaiChars = true;
+			}
+			tisText += (char)thaiChar;
+			// int, not char: a malformed lead-byte run decodes as one character too long for a char
+			charSizes.push_back((int)(p - charStart));
+			if (!isLetter(ch) || (lastChar && !isLetter(lastChar))) {
+				breaks.insert(charIndex);
+			}
+			charIndex++;
+			lastChar = ch;
+			ch = nextUtf8Char(p, charStart, text.end());
+		}
+
+		// If there were any Thai characters in the string, run th_brk on it and add
+		// the resulting break positions
+		if (hasThaiChars) {
+			IntVector thaiBreakPositions;
+			tisText += '\0';
+			thaiBreakPositions.resize(tisText.size());
+			int numBreaks = th_brk((const thchar_t*)(tisText.data()),
+					&thaiBreakPositions[0], thaiBreakPositions.size());
+			thaiBreakPositions.resize(numBreaks);
+			breaks.insert(thaiBreakPositions.begin(), thaiBreakPositions.end());
+		}
+
+		// Add a fake end-of-string character and have a break on it, so that the
+		// last word gets added without special handling
+		breaks.insert((int)charSizes.size());
+		charSizes.push_back(0);
+
+		// Now make the word array by traversing the breaks set
+		p = text.begin();
+		IntSet::iterator pBrk = breaks.begin();
+		String::const_iterator wordStart = text.begin();
+		String::const_iterator suffixStart = text.end();
+		// If there's a break at the start of the string, skip it
+		if (pBrk != breaks.end() && *pBrk == 0) {
+			pBrk++;
+		}
+
+		for (charIndex = 0; charIndex < (int)charSizes.size(); p += charSizes[charIndex++]) {
+			// Assume all spaces are ASCII; never dereference text.end() (the fake character)
+			if (p != text.end() && isSpace(*p)) {
+				suffixStart = p;
+			}
+			if (pBrk != breaks.end() && charIndex == *pBrk) {
+				if (suffixStart == text.end()) {
+					words.push_back(Word(wordStart, p, p));
+				} else {
+					words.push_back(Word(wordStart, suffixStart, p));
+				}
+				pBrk++;
+				suffixStart = text.end();
+				wordStart = p;
+			}
+		}
+	}
+}
+
+#endif // TEXTUTIL_H

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/collab-maint/wikidiff2.git



More information about the Pkg-mediawiki-commits mailing list