[SCM] Development for GoFind! branch, master, updated. 56b0cd6ccd531af549aebd4f767ff77a21cd8949

Wed May 6 15:46:19 UTC 2009

The following commit has been merged in the master branch:
commit 56b0cd6ccd531af549aebd4f767ff77a21cd8949
Author: Miriam Ruiz <miriam at debian.org>
Date:   Wed May 6 17:53:33 2009 +0200

    Improved parsing support for HTML entities in UTF8
    Restructured UTF8 related code

diff --git a/Makefile b/Makefile
index 06fc801..c432a64 100644
--- a/Makefile
+++ b/Makefile
@@ -26,17 +26,20 @@ PKGCONFIG_CFLAGS= `pkg-config $(PKGCONFIG_FILES) --cflags`
 DEFS=-DUSE_GETTEXT
 STATIC_CFLAGS= -O2 -g -Wall $(DEFS)
 CFLAGS= $(STATIC_CFLAGS) -fPIC
+TEST_CFLAGS= -Wall -Werror $(CFLAGS)
 LDFLAGS= -Wl,-z,defs -Wl,--as-needed -Wl,--no-undefined
 PLUGINS_LDFLAGS= 
 LIBS= -lept -lept-core -lapt-pkg -lxapian -ldl `pkg-config $(PKGCONFIG_FILES) --libs`
 
 OBJS= Engine.o Environment.o filter.o field.o gofind.o \
 	taghandler.o cfgmanager.o boolparser.o apthelper.o \
-	utf8.o dll.o guiplugin.o pkgdata.o slre.o i18n.o
+	dll.o guiplugin.o pkgdata.o slre.o i18n.o \
+	utf8/parser.o utf8/html.o
 
 LIB_OBJS= Engine.o Environment.o filter.o field.o \
 	taghandler.o cfgmanager.o boolparser.o apthelper.o \
-	utf8.o dll.o guiplugin.o pkgdata.o slre.o i18n.o
+	dll.o guiplugin.o pkgdata.o slre.o i18n.o \
+	utf8/parser.o utf8/html.o 
 
 HEADERS=$(shell find . -name "*.h")
 
@@ -89,24 +92,25 @@ gui_lua.o: gui_lua.cpp
 gui_luagtk.o: gui_luagtk.cpp
 	g++ -o $@ -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS) `pkg-config lua5.1 --cflags`
 
-%.o: %.cpp $(HEADERS)
+%.o: %.cpp $(HEADERS) Makefile
 	g++ -o $@ -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS)
 
-%.o: %.c $(HEADERS)
+%.o: %.c $(HEADERS) Makefile
 	gcc -o $@ -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS)
 
 %.so : %.o
 	g++ $(LDFLAGS) -shared $^ -o $@
 
-%.static.o: %.cpp $(HEADERS)
+%.static.o: %.cpp $(HEADERS) Makefile
 	g++ -o $@ -c $< $(STATIC_CFLAGS) $(PKGCONFIG_CFLAGS)
 
-%.static.o: %.c $(HEADERS)
+%.static.o: %.c $(HEADERS) Makefile
 	gcc -o $@ -c $< $(STATIC_CFLAGS) $(PKGCONFIG_CFLAGS)
 
 TEST_OBJS= filter.test.o taghandler.test.o cfgmanager.test.o \
-	boolparser.test.o slre.test.o utf8.test.o apthelper.test.o \
-	dll.test.o i18n.test.o utf8/testutf8.test.o CuTest.o test.o
+	boolparser.test.o slre.test.o apthelper.test.o \
+	dll.test.o i18n.test.o utf8/testutf8.test.o \
+	CuTest.o test.o utf8/parser.test.o utf8/html.test.o 
 TEST_PLUGINS= testplugin.test.so
 
 test: $(TEST_OBJS) $(TEST_PLUGINS)
@@ -115,17 +119,17 @@ test: $(TEST_OBJS) $(TEST_PLUGINS)
 test.c:
 	sh CuTest.sh > $@
 
-test.o: test.c $(HEADERS)
-	gcc -o $@ -DUNIT_TEST -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS)
+test.o: test.c $(HEADERS) Makefile
+	gcc -o $@ -DUNIT_TEST -c $< $(TEST_CFLAGS) $(PKGCONFIG_CFLAGS)
 
-%.test.o: %.cpp $(HEADERS)
-	g++ -o $@ -DUNIT_TEST -c $< -Wall -Werror $(CFLAGS) $(PKGCONFIG_CFLAGS)
+%.test.o: %.cpp $(HEADERS) Makefile
+	g++ -o $@ -DUNIT_TEST -c $< $(TEST_CFLAGS) $(PKGCONFIG_CFLAGS)
 
-%.test.o: %.c $(HEADERS)
-	gcc -o $@ -DUNIT_TEST -c $< -Wall -Werror $(CFLAGS) $(PKGCONFIG_CFLAGS)
+%.test.o: %.c $(HEADERS) Makefile
+	gcc -o $@ -DUNIT_TEST -c $< $(TEST_CFLAGS) $(PKGCONFIG_CFLAGS)
 
 %.test.so : %.test.o
 	g++ $(CFLAGS) $(PKGCONFIG_CFLAGS) -DUNIT_TEST -shared $^ -o $@
 
 clean:
-	rm -f gofind test test.c cli/*.o fltk/*.o *.o *.so *.so* *.a
+	rm -f gofind test test.c utf8/*.o cli/*.o fltk/*.o *.o *.so *.so* *.a
diff --git a/boolparser.cpp b/boolparser.cpp
index 461de07..b3462e8 100644
--- a/boolparser.cpp
+++ b/boolparser.cpp
@@ -23,10 +23,6 @@
 #include "common.h"
 #include "boolparser.h"
 
-#ifdef USE_UTF8
-#include "utf8.h"
-#endif
-
 #include <cassert>
 #include <memory>
 #include <cstring>
diff --git a/gofind.cpp b/gofind.cpp
index b0413b4..dfd3d6b 100644
--- a/gofind.cpp
+++ b/gofind.cpp
@@ -2,7 +2,7 @@
  * debtags - Implement package tags support for Debian
  *
  * Copyright (C) 2007  Enrico Zini <enrico at debian.org>
- * Copyright (C) 2007, 2008  Miriam Ruiz <little_miry at yahoo.es>
+ * Copyright (C) 2007-2009  Miriam Ruiz <little_miry at yahoo.es>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/slre.c b/slre.c
index ac50590..b8bd4a9 100644
--- a/slre.c
+++ b/slre.c
@@ -21,6 +21,8 @@
 /*
  * Copyright (c) 2004-2005 Sergey Lyubka <valenok at gmail.com>
  * All rights reserved
+ * 
+ * Webpage: http://slre.sourceforge.net/
  *
  * "THE BEER-WARE LICENSE" (Revision 42):
  * Sergey Lyubka wrote this file.  As long as you retain this notice you
@@ -28,15 +30,29 @@
  * this stuff is worth it, you can buy me a beer in return.
  */
 
-/* Webpage: http://slre.sourceforge.net/ */
+/* 
+ *
+ * Functions helper_utf8_to_unicode_char, helper_unicode_is_space and helper_unicode_is_digit
+ * 
+ * helper_utf8.c - Raptor UTF-8 and Unicode support
+ *
+ * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
+ * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ *   2. GNU General Public License (GPL) V2 or any newer version
+ *   3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ */
 
 #include "common.h"
 #include "slre.h"
 
-#ifdef USE_UTF8
-#include "utf8.h"
-#endif
-
 #include <stdio.h>
 #include <assert.h>
 #include <ctype.h>
@@ -48,6 +64,172 @@
 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
 	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
 
+#ifdef USE_UTF8
+/**
+ * helper_utf8_to_unicode_char:
+ * @output: Pointer to the Unicode character or NULL
+ * @input: UTF-8 string buffer
+ * @length: buffer size
+ *
+ * Convert an UTF-8 encoded buffer to a Unicode character.
+ *
+ * If output is NULL, then will calculate the number of bytes that
+ * will be used from the input buffer and not perform the conversion.
+ *
+ * Return value: bytes used from input buffer or <0 on failure:
+ *  -1 input buffer too short or length error
+ *  -2 overlong UTF-8 sequence
+ *  -3 illegal code positions
+ *  -4 code out of range U+0000 to U+10FFFF.
+ *  In cases -2, -3 and -4 the coded character is stored in the output.
+ */
+static int helper_utf8_to_unicode_char(unsigned long *output, const unsigned char *input, int length)
+{
+	unsigned char in;
+	int size;
+	unsigned long c=0;
+
+	if(length < 1)
+		return -1;
+
+	in=*input++;
+	if((in & 0x80) == 0)
+	{
+		size=1;
+		c= in & 0x7f;
+	}
+	else if((in & 0xe0) == 0xc0)
+	{
+		size=2;
+		c= in & 0x1f;
+	}
+	else if((in & 0xf0) == 0xe0)
+	{
+		size=3;
+		c= in & 0x0f;
+	}
+	else if((in & 0xf8) == 0xf0)
+	{
+		size=4;
+		c = in & 0x07;
+	}
+	else if((in & 0xfc) == 0xf8)
+	{
+		size=5;
+		c = in & 0x03;
+	}
+	else if((in & 0xfe) == 0xfc)
+	{
+		size=6;
+		c = in & 0x01;
+	} else
+	return -1;
+
+	if(!output)
+		return size;
+
+	if(length < size)
+		return -1;
+
+	switch(size)
+	{
+		case 6:
+			in=*input++ & 0x3f;
+			c= c << 6;
+			c |= in;
+			/* FALLTHROUGH */
+		case 5:
+			in=*input++ & 0x3f;
+			c= c << 6;
+			c |= in;
+			/* FALLTHROUGH */
+		case 4:
+			in=*input++ & 0x3f;
+			c= c << 6;
+			c |= in;
+			/* FALLTHROUGH */
+		case 3:
+			in=*input++ & 0x3f;
+			c= c << 6;
+			c |= in;
+			/* FALLTHROUGH */
+		case 2:
+			in=*input++ & 0x3f;
+			c= c << 6;
+			c |= in;
+			/* FALLTHROUGH */
+		default:
+			break;
+	}
+
+	*output=c;
+
+	/* check for overlong UTF-8 sequences */
+	switch(size)
+	{
+		case 2:
+			if(c < 0x00000080)
+				return -2;
+			break;
+		case 3:
+			if(c < 0x00000800)
+				return -2;
+			break;
+		case 4:
+			if(c < 0x00010000)
+				return -2;
+			break;
+
+		default:				 /* 1 */
+			break;
+	}
+
+	/* check for illegal code positions:
+	 * U+D800 to U+DFFF (UTF-16 surrogates)
+	 * U+FFFE and U+FFFF
+	 */
+	if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
+		return -3;
+
+	/* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
+	/* of course this makes some 4 byte forms illegal */
+	if(c > 0x10ffff)
+		return -4;
+
+	return size;
+}
+
+static int helper_unicode_is_space(long c)
+{
+	return((c == 0x0020) ||		 /* Space */
+		(c == 0x000C) ||		 /* Page jump: \f */
+		(c == 0x000D) ||		 /* Carriage return: \r */
+		(c == 0x000A) ||		 /* Next line: \n */
+		(c == 0x0009) ||		 /* Horizontal tab: \t */
+		(c == 0x000B) );		 /* Vertical tab \v */
+}
+
+static int helper_unicode_is_digit(long c)
+{
+	/* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */
+	return((c >= 0x0030 && c <= 0x0039 ) ||
+		(c >= 0x0660 && c <= 0x0669 ) ||
+		(c >= 0x06F0 && c <= 0x06F9 ) ||
+		(c >= 0x0966 && c <= 0x096F ) ||
+		(c >= 0x09E6 && c <= 0x09EF ) ||
+		(c >= 0x0A66 && c <= 0x0A6F ) ||
+		(c >= 0x0AE6 && c <= 0x0AEF ) ||
+		(c >= 0x0B66 && c <= 0x0B6F ) ||
+		(c >= 0x0BE7 && c <= 0x0BEF ) ||
+		(c >= 0x0C66 && c <= 0x0C6F ) ||
+		(c >= 0x0CE6 && c <= 0x0CEF ) ||
+		(c >= 0x0D66 && c <= 0x0D6F ) ||
+		(c >= 0x0E50 && c <= 0x0E59 ) ||
+		(c >= 0x0ED0 && c <= 0x0ED9 ) ||
+		(c >= 0x0F20 && c <= 0x0F29 ));
+}
+#endif // USE_UTF8
+
 static struct {
 	const char	*name;
 	int		narg;
diff --git a/utf8.c b/utf8.c
deleted file mode 100644
index 40a5532..0000000
--- a/utf8.c
+++ /dev/null
@@ -1,768 +0,0 @@
-/* 
- * helper_utf8.c - Raptor UTF-8 and Unicode support
- * 
- * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
- * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
- * 
- * This package is Free Software and part of Redland http://librdf.org/
- * 
- * It is licensed under the following three licenses as alternatives:
- *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
- *   2. GNU General Public License (GPL) V2 or any newer version
- *   3. Apache License, V2.0 or any newer version
- * 
- * You may not use this file except in compliance with at least one of
- * the above three licenses.
- */
-
-#include "common.h"
-#include "utf8.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-
-
-/**
- * helper_unicode_char_to_utf8:
- * @c: Unicode character
- * @output: UTF-8 string buffer or NULL
- *
- * Convert a Unicode character to UTF-8 encoding.
- * 
- * Based on librdf_unicode_char_to_utf8() with no need to calculate
- * length since the encoded character is always copied into a buffer
- * with sufficient size.
- * 
- * Return value: bytes encoded to output buffer or <0 on failure
- **/
-int helper_unicode_char_to_utf8(unsigned long c, unsigned char *output)
-{
-  int size=0;
-  
-  if      (c < 0x00000080)
-    size=1;
-  else if (c < 0x00000800)
-    size=2;
-  else if (c < 0x00010000)
-    size=3;
-  else if (c < 0x00200000)
-    size=4;
-  else if (c < 0x04000000)
-    size=5;
-  else if (c < 0x80000000)
-    size=6;
-  else
-    return -1;
-
-  switch(size) {
-    case 6:
-      output[5]=0x80 | (unsigned char)(c & 0x3F);
-      c= c >> 6;
-       /* set bit 2 (bits 7,6,5,4,3,2 less 7,6,5,4,3 set below) on last byte */
-      c |= 0x4000000; /* 0x10000 = 0x04 << 24 */
-      /* FALLTHROUGH */
-    case 5:
-      output[4]=0x80 | (unsigned char)(c & 0x3F);
-      c= c >> 6;
-       /* set bit 3 (bits 7,6,5,4,3 less 7,6,5,4 set below) on last byte */
-      c |= 0x200000; /* 0x10000 = 0x08 << 18 */
-      /* FALLTHROUGH */
-    case 4:
-      output[3]=0x80 | (unsigned char)(c & 0x3F);
-      c= c >> 6;
-       /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on last byte */
-      c |= 0x10000; /* 0x10000 = 0x10 << 12 */
-      /* FALLTHROUGH */
-    case 3:
-      output[2]=0x80 | (unsigned char)(c & 0x3F);
-      c= c >> 6;
-      /* set bit 5 (bits 7,6,5 less 7,6 set below) on last byte */
-      c |= 0x800; /* 0x800 = 0x20 << 6 */
-      /* FALLTHROUGH */
-    case 2:
-      output[1]=0x80 | (unsigned char)(c & 0x3F);
-      c= c >> 6;
-      /* set bits 7,6 on last byte */
-      c |= 0xc0; 
-      /* FALLTHROUGH */
-    case 1:
-      output[0]=(unsigned char)c;
-  }
-
-  return size;
-}
-
-
-/**
- * helper_utf8_to_unicode_char:
- * @output: Pointer to the Unicode character or NULL
- * @input: UTF-8 string buffer
- * @length: buffer size
- *
- * Convert an UTF-8 encoded buffer to a Unicode character.
- * 
- * If output is NULL, then will calculate the number of bytes that
- * will be used from the input buffer and not perform the conversion.
- * 
- * Return value: bytes used from input buffer or <0 on failure:
- *  -1 input buffer too short or length error
- *  -2 overlong UTF-8 sequence
- *  -3 illegal code positions
- *  -4 code out of range U+0000 to U+10FFFF.
- *  In cases -2, -3 and -4 the coded character is stored in the output.
- */
-int helper_utf8_to_unicode_char(unsigned long *output,
-                            const unsigned char *input, int length)
-{
-  unsigned char in;
-  int size;
-  unsigned long c=0;
-  
-  if(length < 1)
-    return -1;
-
-  in=*input++;
-  if((in & 0x80) == 0) {
-    size=1;
-    c= in & 0x7f;
-  } else if((in & 0xe0) == 0xc0) {
-    size=2;
-    c= in & 0x1f;
-  } else if((in & 0xf0) == 0xe0) {
-    size=3;
-    c= in & 0x0f;
-  } else if((in & 0xf8) == 0xf0) {
-    size=4;
-    c = in & 0x07;
-  } else if((in & 0xfc) == 0xf8) {
-    size=5;
-    c = in & 0x03;
-  } else if((in & 0xfe) == 0xfc) {
-    size=6;
-    c = in & 0x01;
-  } else
-    return -1;
-
-
-  if(!output)
-    return size;
-
-  if(length < size)
-    return -1;
-
-  switch(size) {
-    case 6:
-      in=*input++ & 0x3f;
-      c= c << 6;
-      c |= in;
-      /* FALLTHROUGH */
-    case 5:
-      in=*input++ & 0x3f;
-      c= c << 6;
-      c |= in;
-      /* FALLTHROUGH */
-    case 4:
-      in=*input++ & 0x3f;
-      c= c << 6;
-      c |= in;
-      /* FALLTHROUGH */
-    case 3:
-      in=*input++ & 0x3f;
-      c= c << 6;
-      c |= in;
-      /* FALLTHROUGH */
-    case 2:
-      in=*input++ & 0x3f;
-      c= c << 6;
-      c |= in;
-      /* FALLTHROUGH */
-    default:
-      break;
-  }
-  
-  *output=c;
-
-  /* check for overlong UTF-8 sequences */
-  switch(size) {
-    case 2:
-      if(c < 0x00000080)
-        return -2;
-      break;
-    case 3:
-      if(c < 0x00000800)
-        return -2;
-      break;
-    case 4:
-      if(c < 0x00010000)
-        return -2;
-      break;
-
-    default: /* 1 */
-      break;
-  }
-
-
-  /* check for illegal code positions:
-   * U+D800 to U+DFFF (UTF-16 surrogates)
-   * U+FFFE and U+FFFF
-   */
-  if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
-    return -3;
-
-  /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
-  /* of course this makes some 4 byte forms illegal */
-  if(c > 0x10ffff)
-    return -4;
-
-  return size;
-}
-
-
-/**
- * helper_unicode_is_xml11_namestartchar:
- * @c: Unicode character to check
- * 
- * Check if Unicode character is legal to start an XML 1.1 Name
- * 
- * Namespaces in XML 1.1 REC 2004-02-04
- *   http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
- * updating
- *   Extensible Markup Language (XML) 1.1 REC 2004-02-04
- *   http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
- * excluding the ':'
- *
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml11_namestartchar(long c)
-{
-  return (((c >= 0x0041)  && (c <= 0x005A)) || /* [A-Z] */
-          (c == 0x005F) ||                     /* '_' */
-          ((c >= 0x0061)  && (c <= 0x007A)) || /* [a-z] */
-          ((c >= 0x00C0)  && (c <= 0x00D6)) ||
-          ((c >= 0x00D8)  && (c <= 0x00F6)) ||
-          ((c >= 0x00F8)  && (c <= 0x02FF)) ||
-          ((c >= 0x0370)  && (c <= 0x037D)) ||
-          ((c >= 0x037F)  && (c <= 0x1FFF)) ||
-          ((c >= 0x200C)  && (c <= 0x200D)) ||
-          ((c >= 0x2070)  && (c <= 0x218F)) ||
-          ((c >= 0x2C00)  && (c <= 0x2FEF)) ||
-          ((c >= 0x3001)  && (c <= 0xD7FF)) ||
-          ((c >= 0xF900)  && (c <= 0xFDCF)) ||
-          ((c >= 0xFDF0)  && (c <= 0xFFFD)) ||
-          ((c >= 0x10000) && (c <= 0xEFFFF)));
-}
-
-
-/**
- * helper_unicode_is_xml10_namestartchar:
- * @c: Unicode character to check
- *
- * Check if Unicode character is legal to start an XML 1.0 Name
- * 
- * Namespaces in XML REC 1999-01-14
- *   http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
- * updating
- *   Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
- *   http://www.w3.org/TR/2004/REC-xml-20040204/
- * excluding the ':'
- *
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml10_namestartchar(long c)
-{
-  return (helper_unicode_is_letter(c) ||
-          (c == '_'));
-}
-
-
-/**
- * helper_unicode_is_xml11_namechar:
- * @c: Unicode character
- * 
- * Check if a Unicode codepoint is a legal to continue an XML 1.1 Name
- *
- * Namespaces in XML 1.1 REC 2004-02-04
- *   http://www.w3.org/TR/2004/REC-xml11-20040204/
- * updating
- *   Extensible Markup Language (XML) 1.1 REC 2004-02-04
- *   http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
- * excluding the ':'
- * 
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml11_namechar(long c)
-{
-  return (helper_unicode_is_xml11_namestartchar(c) ||
-          (c == 0x002D) || /* '-' */
-          (c == 0x002E) || /* '.' */
-          (c >= 0x0030 && c <= 0x0039) || /* 0-9 */
-          (c == 0x00B7) ||
-          (c >= 0x0300 && c <=0x036F) ||
-          (c >= 0x203F && c <=0x2040));
-}
-
-
-/**
- * helper_unicode_is_xml10_namechar:
- * @c: Unicode character
- * 
- * Check if a Unicode codepoint is a legal to continue an XML 1.0 Name
- * 
- * Namespaces in XML REC 1999-01-14
- *   http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar
- * updating
- *   Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
- *   http://www.w3.org/TR/2004/REC-xml-20040204/
- * excluding the ':'
- *
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml10_namechar(long c)
-{
-  return (helper_unicode_is_letter(c) ||
-          helper_unicode_is_digit(c) ||
-          (c == 0x002E) || /* '.' */
-          (c == 0x002D) || /* '-' */
-          (c == 0x005F) || /* '_' */
-          helper_unicode_is_combiningchar(c) ||
-          helper_unicode_is_extender(c));
-}
-
-
-/*
- * All this below was derived by machine-transforming the classes in Appendix B
- * of http://www.w3.org/TR/2000/REC-xml-20001006
- */
-
-int helper_unicode_is_letter(long c)
-{
-  return(helper_unicode_is_basechar(c) ||
-         helper_unicode_is_ideographic(c));
-}
-
-
-int helper_unicode_is_basechar(long c)
-{
-  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */
-  return((c >= 0x0041 && c <= 0x005A ) ||
-         (c >= 0x0061 && c <= 0x007A ) ||
-         (c >= 0x00C0 && c <= 0x00D6 ) ||
-         (c >= 0x00D8 && c <= 0x00F6 ) ||
-         (c >= 0x00F8 && c <= 0x00FF ) ||
-         (c >= 0x0100 && c <= 0x0131 ) ||
-         (c >= 0x0134 && c <= 0x013E ) ||
-         (c >= 0x0141 && c <= 0x0148 ) ||
-         (c >= 0x014A && c <= 0x017E ) ||
-         (c >= 0x0180 && c <= 0x01C3 ) ||
-         (c >= 0x01CD && c <= 0x01F0 ) ||
-         (c >= 0x01F4 && c <= 0x01F5 ) || 
-         (c >= 0x01FA && c <= 0x0217 ) ||
-         (c >= 0x0250 && c <= 0x02A8 ) ||
-         (c >= 0x02BB && c <= 0x02C1 ) ||
-         (c == 0x0386) || 
-         (c >= 0x0388 && c <= 0x038A ) ||
-         (c == 0x038C) ||
-         (c >= 0x038E && c <= 0x03A1 ) ||
-         (c >= 0x03A3 && c <= 0x03CE ) ||
-         (c >= 0x03D0 && c <= 0x03D6 ) ||
-         (c == 0x03DA) ||
-         (c == 0x03DC) ||
-         (c == 0x03DE) ||
-         (c == 0x03E0) ||
-         (c >= 0x03E2 && c <= 0x03F3 ) ||
-         (c >= 0x0401 && c <= 0x040C ) ||
-         (c >= 0x040E && c <= 0x044F ) ||
-         (c >= 0x0451 && c <= 0x045C ) ||
-         (c >= 0x045E && c <= 0x0481 ) ||
-         (c >= 0x0490 && c <= 0x04C4 ) ||
-         (c >= 0x04C7 && c <= 0x04C8 ) ||
-         (c >= 0x04CB && c <= 0x04CC ) ||
-         (c >= 0x04D0 && c <= 0x04EB ) ||
-         (c >= 0x04EE && c <= 0x04F5 ) ||
-         (c >= 0x04F8 && c <= 0x04F9 ) ||
-         (c >= 0x0531 && c <= 0x0556 ) ||
-         (c == 0x0559) ||
-         (c >= 0x0561 && c <= 0x0586 ) ||
-         (c >= 0x05D0 && c <= 0x05EA ) ||
-         (c >= 0x05F0 && c <= 0x05F2 ) ||
-         (c >= 0x0621 && c <= 0x063A ) ||
-         (c >= 0x0641 && c <= 0x064A ) ||
-         (c >= 0x0671 && c <= 0x06B7 ) ||
-         (c >= 0x06BA && c <= 0x06BE ) ||
-         (c >= 0x06C0 && c <= 0x06CE ) ||
-         (c >= 0x06D0 && c <= 0x06D3 ) ||
-         (c == 0x06D5) ||
-         (c >= 0x06E5 && c <= 0x06E6 ) ||
-         (c >= 0x0905 && c <= 0x0939 ) ||
-         (c == 0x093D) ||
-         (c >= 0x0958 && c <= 0x0961 ) ||
-         (c >= 0x0985 && c <= 0x098C ) ||
-         (c >= 0x098F && c <= 0x0990 ) ||
-         (c >= 0x0993 && c <= 0x09A8 ) ||
-         (c >= 0x09AA && c <= 0x09B0 ) ||
-         (c == 0x09B2) ||
-         (c >= 0x09B6 && c <= 0x09B9 ) ||
-         (c >= 0x09DC && c <= 0x09DD ) ||
-         (c >= 0x09DF && c <= 0x09E1 ) ||
-         (c >= 0x09F0 && c <= 0x09F1 ) ||
-         (c >= 0x0A05 && c <= 0x0A0A ) ||
-         (c >= 0x0A0F && c <= 0x0A10 ) ||
-         (c >= 0x0A13 && c <= 0x0A28 ) ||
-         (c >= 0x0A2A && c <= 0x0A30 ) ||
-         (c >= 0x0A32 && c <= 0x0A33 ) ||
-         (c >= 0x0A35 && c <= 0x0A36 ) ||
-         (c >= 0x0A38 && c <= 0x0A39 ) ||
-         (c >= 0x0A59 && c <= 0x0A5C ) || 
-         (c == 0x0A5E) ||
-         (c >= 0x0A72 && c <= 0x0A74 ) ||
-         (c >= 0x0A85 && c <= 0x0A8B ) ||
-         (c == 0x0A8D) ||
-	 (c >= 0x0A8F && c <= 0x0A91 ) ||
-         (c >= 0x0A93 && c <= 0x0AA8 ) ||
-         (c >= 0x0AAA && c <= 0x0AB0 ) ||
-         (c >= 0x0AB2 && c <= 0x0AB3 ) ||
-	 (c >= 0x0AB5 && c <= 0x0AB9 ) ||
-         (c == 0x0ABD) ||
-         (c == 0x0AE0) ||
-         (c >= 0x0B05 && c <= 0x0B0C ) ||
-	 (c >= 0x0B0F && c <= 0x0B10 ) ||
-         (c >= 0x0B13 && c <= 0x0B28 ) ||
-         (c >= 0x0B2A && c <= 0x0B30 ) ||
-         (c >= 0x0B32 && c <= 0x0B33 ) ||
-	 (c >= 0x0B36 && c <= 0x0B39 ) ||
-         (c == 0x0B3D) ||
-         (c >= 0x0B5C && c <= 0x0B5D ) ||
-         (c >= 0x0B5F && c <= 0x0B61 ) ||
-	 (c >= 0x0B85 && c <= 0x0B8A ) ||
-         (c >= 0x0B8E && c <= 0x0B90 ) ||
-         (c >= 0x0B92 && c <= 0x0B95 ) ||
-         (c >= 0x0B99 && c <= 0x0B9A ) ||
-	 (c == 0x0B9C) ||
-         (c >= 0x0B9E && c <= 0x0B9F ) ||
-         (c >= 0x0BA3 && c <= 0x0BA4 ) ||
-         (c >= 0x0BA8 && c <= 0x0BAA ) ||
-	 (c >= 0x0BAE && c <= 0x0BB5 ) ||
-         (c >= 0x0BB7 && c <= 0x0BB9 ) ||
-         (c >= 0x0C05 && c <= 0x0C0C ) ||
-         (c >= 0x0C0E && c <= 0x0C10 ) ||
-	 (c >= 0x0C12 && c <= 0x0C28 ) ||
-         (c >= 0x0C2A && c <= 0x0C33 ) ||
-         (c >= 0x0C35 && c <= 0x0C39 ) ||
-         (c >= 0x0C60 && c <= 0x0C61 ) ||
-	 (c >= 0x0C85 && c <= 0x0C8C ) ||
-         (c >= 0x0C8E && c <= 0x0C90 ) ||
-         (c >= 0x0C92 && c <= 0x0CA8 ) ||
-         (c >= 0x0CAA && c <= 0x0CB3 ) ||
-	 (c >= 0x0CB5 && c <= 0x0CB9 ) ||
-         (c == 0x0CDE) ||
-         (c >= 0x0CE0 && c <= 0x0CE1 ) ||
-         (c >= 0x0D05 && c <= 0x0D0C ) ||
-	 (c >= 0x0D0E && c <= 0x0D10 ) ||
-         (c >= 0x0D12 && c <= 0x0D28 ) ||
-         (c >= 0x0D2A && c <= 0x0D39 ) ||
-         (c >= 0x0D60 && c <= 0x0D61 ) ||
-	 (c >= 0x0E01 && c <= 0x0E2E ) ||
-         (c == 0x0E30) ||
-         (c >= 0x0E32 && c <= 0x0E33 ) ||
-         (c >= 0x0E40 && c <= 0x0E45 ) ||
-	 (c >= 0x0E81 && c <= 0x0E82 ) ||
-         (c == 0x0E84) ||
-         (c >= 0x0E87 && c <= 0x0E88 ) ||
-         (c == 0x0E8A) ||
-	 (c == 0x0E8D) ||
-         (c >= 0x0E94 && c <= 0x0E97 ) ||
-         (c >= 0x0E99 && c <= 0x0E9F ) ||
-         (c >= 0x0EA1 && c <= 0x0EA3 ) ||
-	 (c == 0x0EA5) ||
-         (c == 0x0EA7) ||
-         (c >= 0x0EAA && c <= 0x0EAB ) ||
-         (c >= 0x0EAD && c <= 0x0EAE ) ||
-	 (c == 0x0EB0) ||
-         (c >= 0x0EB2 && c <= 0x0EB3 ) ||
-         (c == 0x0EBD) ||
-         (c >= 0x0EC0 && c <= 0x0EC4 ) ||
-	 (c >= 0x0F40 && c <= 0x0F47 ) ||
-         (c >= 0x0F49 && c <= 0x0F69 ) ||
-         (c >= 0x10A0 && c <= 0x10C5 ) ||
-         (c >= 0x10D0 && c <= 0x10F6 ) ||
-	 (c == 0x1100) ||
-         (c >= 0x1102 && c <= 0x1103 ) ||
-         (c >= 0x1105 && c <= 0x1107 ) ||
-         (c == 0x1109) ||
-         (c >= 0x110B && c <= 0x110C ) ||
-         (c >= 0x110E && c <= 0x1112 ) ||
-         (c == 0x113C) ||
-         (c == 0x113E) ||
-         (c == 0x1140) ||
-         (c == 0x114C) ||
-         (c == 0x114E) ||
-         (c == 0x1150) ||
-         (c >= 0x1154 && c <= 0x1155 ) ||
-         (c == 0x1159) ||
-         (c >= 0x115F && c <= 0x1161 ) ||
-         (c == 0x1163) ||
-         (c == 0x1165) ||
-         (c == 0x1167) ||
-	 (c == 0x1169) ||
-         (c >= 0x116D && c <= 0x116E ) ||
-         (c >= 0x1172 && c <= 0x1173 ) ||
-         (c == 0x1175) ||
-	 (c == 0x119E) ||
-         (c == 0x11A8) ||
-         (c == 0x11AB) ||
-         (c >= 0x11AE && c <= 0x11AF ) ||
-         (c >= 0x11B7 && c <= 0x11B8 ) ||
-	 (c == 0x11BA) ||
-         (c >= 0x11BC && c <= 0x11C2 ) ||
-         (c == 0x11EB) ||
-         (c == 0x11F0) ||
-         (c == 0x11F9) ||
-	 (c >= 0x1E00 && c <= 0x1E9B ) ||
-         (c >= 0x1EA0 && c <= 0x1EF9 ) ||
-         (c >= 0x1F00 && c <= 0x1F15 ) ||
-         (c >= 0x1F18 && c <= 0x1F1D ) ||
-	 (c >= 0x1F20 && c <= 0x1F45 ) ||
-         (c >= 0x1F48 && c <= 0x1F4D ) ||
-         (c >= 0x1F50 && c <= 0x1F57 ) ||
-         (c == 0x1F59) ||
-	 (c == 0x1F5B) ||
-         (c == 0x1F5D) ||
-         (c >= 0x1F5F && c <= 0x1F7D ) ||
-         (c >= 0x1F80 && c <= 0x1FB4 ) ||
-	 (c >= 0x1FB6 && c <= 0x1FBC ) ||
-         (c == 0x1FBE) ||
-         (c >= 0x1FC2 && c <= 0x1FC4 ) ||
-         (c >= 0x1FC6 && c <= 0x1FCC ) ||
-	 (c >= 0x1FD0 && c <= 0x1FD3 ) ||
-         (c >= 0x1FD6 && c <= 0x1FDB ) ||
-         (c >= 0x1FE0 && c <= 0x1FEC ) ||
-         (c >= 0x1FF2 && c <= 0x1FF4 ) ||
-	 (c >= 0x1FF6 && c <= 0x1FFC ) ||
-         (c == 0x2126) ||
-         (c >= 0x212A && c <= 0x212B ) ||
-         (c == 0x212E) ||
-	 (c >= 0x2180 && c <= 0x2182 ) ||
-         (c >= 0x3041 && c <= 0x3094 ) ||
-         (c >= 0x30A1 && c <= 0x30FA ) ||
-         (c >= 0x3105 && c <= 0x312C ) ||
-         (c >= 0xAC00 && c <= 0xD7A3 ) 
-         );
-}
-
-   
-int helper_unicode_is_ideographic(long c)
-{
-  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic */
-  return((c >= 0x4E00 && c <= 0x9FA5 ) ||
-         (c == 0x3007) ||
-         (c >= 0x3021 && c <= 0x3029 ));
-}
-
-
-int helper_unicode_is_combiningchar(long c)
-{
-  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar */
-  return((c >= 0x0300 && c <= 0x0345 ) ||
-         (c >= 0x0360 && c <= 0x0361 ) ||
-         (c >= 0x0483 && c <= 0x0486 ) ||
-         (c >= 0x0591 && c <= 0x05A1 ) ||
-         (c >= 0x05A3 && c <= 0x05B9 ) ||
-         (c >= 0x05BB && c <= 0x05BD ) ||
-         (c == 0x05BF) ||
-         (c >= 0x05C1 && c <= 0x05C2 ) ||
-         (c == 0x05C4) ||
-         (c >= 0x064B && c <= 0x0652 ) ||
-         (c == 0x0670) ||
-         (c >= 0x06D6 && c <= 0x06DC ) ||
-	 (c >= 0x06DD && c <= 0x06DF ) ||
-         (c >= 0x06E0 && c <= 0x06E4 ) ||
-         (c >= 0x06E7 && c <= 0x06E8 ) ||
-         (c >= 0x06EA && c <= 0x06ED ) ||
-	 (c >= 0x0901 && c <= 0x0903 ) ||
-         (c == 0x093C) ||
-         (c >= 0x093E && c <= 0x094C ) ||
-         (c == 0x094D) ||
-	 (c >= 0x0951 && c <= 0x0954 ) ||
-         (c >= 0x0962 && c <= 0x0963 ) ||
-         (c >= 0x0981 && c <= 0x0983 ) ||
-         (c == 0x09BC) ||
-	 (c == 0x09BE) ||
-         (c == 0x09BF) ||
-         (c >= 0x09C0 && c <= 0x09C4 ) ||
-         (c >= 0x09C7 && c <= 0x09C8 ) ||
-	 (c >= 0x09CB && c <= 0x09CD ) ||
-         (c == 0x09D7) ||
-         (c >= 0x09E2 && c <= 0x09E3 ) ||
-         (c == 0x0A02) ||
-	 (c == 0x0A3C) ||
-         (c == 0x0A3E) ||
-         (c == 0x0A3F) ||
-         (c >= 0x0A40 && c <= 0x0A42 ) ||
-         (c >= 0x0A47 && c <= 0x0A48 ) ||
-	 (c >= 0x0A4B && c <= 0x0A4D ) ||
-         (c >= 0x0A70 && c <= 0x0A71 ) ||
-         (c >= 0x0A81 && c <= 0x0A83 ) ||
-         (c == 0x0ABC) ||
-	 (c >= 0x0ABE && c <= 0x0AC5 ) ||
-         (c >= 0x0AC7 && c <= 0x0AC9 ) ||
-         (c >= 0x0ACB && c <= 0x0ACD ) ||
-         (c >= 0x0B01 && c <= 0x0B03 ) ||
-	 (c == 0x0B3C) ||
-         (c >= 0x0B3E && c <= 0x0B43 ) ||
-         (c >= 0x0B47 && c <= 0x0B48 ) ||
-         (c >= 0x0B4B && c <= 0x0B4D ) ||
-	 (c >= 0x0B56 && c <= 0x0B57 ) ||
-         (c >= 0x0B82 && c <= 0x0B83 ) ||
-         (c >= 0x0BBE && c <= 0x0BC2 ) ||
-         (c >= 0x0BC6 && c <= 0x0BC8 ) ||
-	 (c >= 0x0BCA && c <= 0x0BCD ) ||
-         (c == 0x0BD7) ||
-         (c >= 0x0C01 && c <= 0x0C03 ) ||
-         (c >= 0x0C3E && c <= 0x0C44 ) ||
-	 (c >= 0x0C46 && c <= 0x0C48 ) ||
-         (c >= 0x0C4A && c <= 0x0C4D ) ||
-         (c >= 0x0C55 && c <= 0x0C56 ) ||
-         (c >= 0x0C82 && c <= 0x0C83 ) ||
-	 (c >= 0x0CBE && c <= 0x0CC4 ) ||
-         (c >= 0x0CC6 && c <= 0x0CC8 ) ||
-         (c >= 0x0CCA && c <= 0x0CCD ) ||
-         (c >= 0x0CD5 && c <= 0x0CD6 ) ||
-	 (c >= 0x0D02 && c <= 0x0D03 ) ||
-         (c >= 0x0D3E && c <= 0x0D43 ) ||
-         (c >= 0x0D46 && c <= 0x0D48 ) ||
-         (c >= 0x0D4A && c <= 0x0D4D ) ||
-	 (c == 0x0D57) ||
-         (c == 0x0E31) ||
-         (c >= 0x0E34 && c <= 0x0E3A ) ||
-         (c >= 0x0E47 && c <= 0x0E4E ) ||
-	 (c == 0x0EB1) ||
-         (c >= 0x0EB4 && c <= 0x0EB9 ) ||
-         (c >= 0x0EBB && c <= 0x0EBC ) ||
-         (c >= 0x0EC8 && c <= 0x0ECD ) ||
-	 (c >= 0x0F18 && c <= 0x0F19 ) ||
-         (c == 0x0F35) ||
-         (c == 0x0F37) ||
-         (c == 0x0F39) ||
-         (c == 0x0F3E) ||
-	 (c == 0x0F3F) ||
-         (c >= 0x0F71 && c <= 0x0F84 ) ||
-         (c >= 0x0F86 && c <= 0x0F8B ) ||
-         (c >= 0x0F90 && c <= 0x0F95 ) ||
-	 (c == 0x0F97) ||
-         (c >= 0x0F99 && c <= 0x0FAD ) ||
-         (c >= 0x0FB1 && c <= 0x0FB7 ) ||
-         (c == 0x0FB9) ||
-	 (c >= 0x20D0 && c <= 0x20DC ) ||
-         (c == 0x20E1) ||
-         (c >= 0x302A && c <= 0x302F ) ||
-         (c == 0x3099) ||
-	 (c == 0x309A));
-}
-
-
-int helper_unicode_is_digit(long c)
-{
-  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */
-  return((c >= 0x0030 && c <= 0x0039 ) ||
-         (c >= 0x0660 && c <= 0x0669 ) ||
-         (c >= 0x06F0 && c <= 0x06F9 ) ||
-         (c >= 0x0966 && c <= 0x096F ) ||
-         (c >= 0x09E6 && c <= 0x09EF ) ||
-         (c >= 0x0A66 && c <= 0x0A6F ) ||
-         (c >= 0x0AE6 && c <= 0x0AEF ) ||
-         (c >= 0x0B66 && c <= 0x0B6F ) ||
-         (c >= 0x0BE7 && c <= 0x0BEF ) ||
-         (c >= 0x0C66 && c <= 0x0C6F ) ||
-         (c >= 0x0CE6 && c <= 0x0CEF ) ||
-         (c >= 0x0D66 && c <= 0x0D6F ) ||
-         (c >= 0x0E50 && c <= 0x0E59 ) ||
-         (c >= 0x0ED0 && c <= 0x0ED9 ) ||
-         (c >= 0x0F20 && c <= 0x0F29 ));
-}
-
-
-int helper_unicode_is_extender(long c)
-{
-  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender */
-  return((c == 0x00B7) ||
-         (c == 0x02D0) ||
-         (c == 0x02D1) ||
-         (c == 0x0387) ||
-         (c == 0x0640) ||
-         (c == 0x0E46) || 
-         (c == 0x0EC6) ||
-         (c == 0x3005) ||
-         (c >= 0x3031 && c <= 0x3035 ) ||
-         (c >= 0x309D && c <= 0x309E ) ||
-         (c >= 0x30FC && c <= 0x30FE ));
-}
-
-
-int helper_unicode_is_space(long c)
-{
-  return((c == 0x0020) || /* Space */
-         (c == 0x000C) || /* Page jump: \f */
-         (c == 0x000D) || /* Carriage return: \r */
-         (c == 0x000A) || /* Next line: \n */
-         (c == 0x0009) || /* Horizontal tab: \t */
-         (c == 0x000B) ); /* Vertical tab \v */
-}
-
-
-/**
- * helper_utf8_is_nfc:
- * @input: UTF-8 string
- * @length: length of string
- *
- * Check a string is in Unicode Normal Form C.
- * 
- * Return value: Non 0 if the string is NFC
- **/
-int helper_utf8_is_nfc(const unsigned char *input, size_t length)
-{
-  unsigned int i;
-  int plain=1;
-  
-  for(i=0; i<length; i++)
-    if(input[i]>0x7f) {
-      plain=0;
-      break;
-    }
-    
-  if(plain)
-    return 1;
-
-#ifdef helper_NFC_CHECK  
-  return helper_nfc_check(input, length, NULL);
-#else
-  return 1;
-#endif
-}
-
-
-/**
- * helper_utf8_check:
- * @string: UTF-8 string
- * @length: length of string
- *
- * Check a string is UTF-8.
- * 
- * Return value: Non 0 if the string is UTF-8
- **/
-int helper_utf8_check(const unsigned char *string, size_t length)
-{
-  while(length > 0) {
-    unsigned long unichar=0;
-
-    int unichar_len=helper_utf8_to_unicode_char(&unichar, string, length);
-    if(unichar_len < 0 || unichar_len > (int)length)
-      return 0;
-
-    if(unichar > 0x10ffff)
-      return 0;
-  
-    string += unichar_len;
-    length -= unichar_len;
-  }
-  return 1;
-}
diff --git a/utf8.h b/utf8.h
deleted file mode 100644
index 7d39644..0000000
--- a/utf8.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* 
- * helper_utf8.c - Raptor UTF-8 and Unicode support
- * 
- * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
- * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
- * 
- * This package is Free Software and part of Redland http://librdf.org/
- * 
- * It is licensed under the following three licenses as alternatives:
- *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
- *   2. GNU General Public License (GPL) V2 or any newer version
- *   3. Apache License, V2.0 or any newer version
- * 
- * You may not use this file except in compliance with at least one of
- * the above three licenses.
- */
-
-#ifndef _GOPLAY_UTF8_H
-#define _GOPLAY_UTF8_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
-int helper_unicode_char_to_utf8(unsigned long c, unsigned char *output);
-int helper_utf8_to_unicode_char(unsigned long *output,
-                            const unsigned char *input, int length);
-
-int helper_unicode_is_letter(long c);
-int helper_unicode_is_basechar(long c);
-int helper_unicode_is_ideographic(long c);
-int helper_unicode_is_combiningchar(long c);
-int helper_unicode_is_digit(long c);
-int helper_unicode_is_extender(long c);
-
-int helper_unicode_is_space(long c);
-
-int helper_unicode_is_xml11_namestartchar(long c);
-int helper_unicode_is_xml10_namestartchar(long c);
-int helper_unicode_is_xml11_namechar(long c);
-int helper_unicode_is_xml10_namechar(long c);
-
-int helper_utf8_is_nfc(const unsigned char *input, size_t length);
-int helper_utf8_check(const unsigned char *string, size_t length);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // _GOPLAY_UTF8_H
diff --git a/utf8/checked.h b/utf8/checked.h
index b57c4dd..4ce6cdf 100644
--- a/utf8/checked.h
+++ b/utf8/checked.h
@@ -172,18 +172,6 @@ namespace utf8
         return next(temp, end);
     }
 
-    /// Deprecated in versions that include "prior"
-    template <typename octet_iterator>
-    uint32_t previous(octet_iterator& it, octet_iterator pass_start)
-    {
-        octet_iterator end = it;
-        while (internal::is_trail(*(--it))) 
-            if (it == pass_start)
-                throw invalid_utf8(*(it+1)); // error - no lead byte in the sequence
-        octet_iterator temp = it;
-        return next(temp, end);
-    }
-
     template <typename octet_iterator, typename distance_type>
     void advance (octet_iterator& it, distance_type n, octet_iterator end)
     {
@@ -205,6 +193,7 @@ namespace utf8
     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
     {       
         while (start != end) {
+            u16bit_iterator last = start;
             uint32_t cp = internal::mask16(*start++);
             // Take care of surrogate pairs first
             if (internal::is_surrogate(cp)) {
@@ -215,8 +204,8 @@ namespace utf8
                     else 
                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
                 }
-                else 
-                    throw invalid_utf16(static_cast<uint16_t>(*(start-1)));
+                else
+                    throw invalid_utf16(static_cast<uint16_t>(*last));
             
             }
             result = append(cp, result);
diff --git a/utf8/html.cpp b/utf8/html.cpp
new file mode 100644
index 0000000..0ecb308
--- /dev/null
+++ b/utf8/html.cpp
@@ -0,0 +1,380 @@
+#include "../common.h"
+#include "parser.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+#include <map>
+
+typedef struct {
+	const char * Name;	// Entity name as written between '&' and ';'; NULL in the last entry of the table
+	unsigned int Char;	// Unicode code point the entity stands for
+	int UTF8Size;	// Value of UTF8SIZE(Char): length in bytes of the UTF-8 form (1, 2 or 3)
+	unsigned long int UTF8;	// UTF-8 bytes as packed by UTF8ENTITY: first byte in bits 0-7, second in 8-15, third in 16-23
+} HTMLEntityDataType;
+
+#define UTF8SIZE(Ch)  ( ((Ch) >= 0x800) ? 3 : ( ((Ch) >= 0x80) ? 2 : 1 ) ) /* bytes of the UTF-8 form; code points below 0x10000 only */
+#define UTF8BYTE1(Ch) ( ((Ch) >= 0x800) ? (0xE0 | ((Ch) >> 12 & 0x0F)) : ( ((Ch) >= 0x80) ? (0xC0 | ((Ch) >> 6 & 0x1F)) : (Ch) ) ) /* first byte */
+#define UTF8BYTE2(Ch) ( ((Ch) >= 0x800) ? (0x80 | ((Ch) >> 6 & 0x3F)) : ( ((Ch) >= 0x80) ? (0x80 | ((Ch) & 0x3F)) : 0 ) ) /* second byte, 0 if none */
+#define UTF8BYTE3(Ch) ( ((Ch) >= 0x800) ? (0x80 | ((Ch) & 0x3F)) : 0 ) /* third byte, 0 if none */
+#define UTF8ENTITY(Name, Ch) { Name, Ch, UTF8SIZE(Ch), UTF8BYTE1(Ch) + (UTF8BYTE2(Ch) << 8) + (UTF8BYTE3(Ch) << 16) } /* one HTMLEntityDataType initializer */
+
+static const HTMLEntityDataType HTMLEntityData[] = { // List of entities defined in the HTML 4.0 spec
+	UTF8ENTITY("nbsp", 160),
+	UTF8ENTITY("iexcl", 161),
+	UTF8ENTITY("cent", 162),
+	UTF8ENTITY("pound", 163),
+	UTF8ENTITY("curren", 164),
+	UTF8ENTITY("yen", 165),
+	UTF8ENTITY("brvbar", 166),
+	UTF8ENTITY("sect", 167),
+	UTF8ENTITY("uml", 168),
+	UTF8ENTITY("copy", 169),
+	UTF8ENTITY("ordf", 170),
+	UTF8ENTITY("laquo", 171),
+	UTF8ENTITY("not", 172),
+	UTF8ENTITY("shy", 173),
+	UTF8ENTITY("reg", 174),
+	UTF8ENTITY("macr", 175),
+	UTF8ENTITY("deg", 176),
+	UTF8ENTITY("plusmn", 177),
+	UTF8ENTITY("sup2", 178),
+	UTF8ENTITY("sup3", 179),
+	UTF8ENTITY("acute", 180),
+	UTF8ENTITY("micro", 181),
+	UTF8ENTITY("para", 182),
+	UTF8ENTITY("middot", 183),
+	UTF8ENTITY("cedil", 184),
+	UTF8ENTITY("sup1", 185),
+	UTF8ENTITY("ordm", 186),
+	UTF8ENTITY("raquo", 187),
+	UTF8ENTITY("frac14", 188),
+	UTF8ENTITY("frac12", 189),
+	UTF8ENTITY("frac34", 190),
+	UTF8ENTITY("iquest", 191),
+	UTF8ENTITY("Agrave", 192),
+	UTF8ENTITY("Aacute", 193),
+	UTF8ENTITY("Acirc", 194),
+	UTF8ENTITY("Atilde", 195),
+	UTF8ENTITY("Auml", 196),
+	UTF8ENTITY("Aring", 197),
+	UTF8ENTITY("AElig", 198),
+	UTF8ENTITY("Ccedil", 199),
+	UTF8ENTITY("Egrave", 200),
+	UTF8ENTITY("Eacute", 201),
+	UTF8ENTITY("Ecirc", 202),
+	UTF8ENTITY("Euml", 203),
+	UTF8ENTITY("Igrave", 204),
+	UTF8ENTITY("Iacute", 205),
+	UTF8ENTITY("Icirc", 206),
+	UTF8ENTITY("Iuml", 207),
+	UTF8ENTITY("ETH", 208),
+	UTF8ENTITY("Ntilde", 209),
+	UTF8ENTITY("Ograve", 210),
+	UTF8ENTITY("Oacute", 211),
+	UTF8ENTITY("Ocirc", 212),
+	UTF8ENTITY("Otilde", 213),
+	UTF8ENTITY("Ouml", 214),
+	UTF8ENTITY("times", 215),
+	UTF8ENTITY("Oslash", 216),
+	UTF8ENTITY("Ugrave", 217),
+	UTF8ENTITY("Uacute", 218),
+	UTF8ENTITY("Ucirc", 219),
+	UTF8ENTITY("Uuml", 220),
+	UTF8ENTITY("Yacute", 221),
+	UTF8ENTITY("THORN", 222),
+	UTF8ENTITY("szlig", 223),
+	UTF8ENTITY("agrave", 224),
+	UTF8ENTITY("aacute", 225),
+	UTF8ENTITY("acirc", 226),
+	UTF8ENTITY("atilde", 227),
+	UTF8ENTITY("auml", 228),
+	UTF8ENTITY("aring", 229),
+	UTF8ENTITY("aelig", 230),
+	UTF8ENTITY("ccedil", 231),
+	UTF8ENTITY("egrave", 232),
+	UTF8ENTITY("eacute", 233),
+	UTF8ENTITY("ecirc", 234),
+	UTF8ENTITY("euml", 235),
+	UTF8ENTITY("igrave", 236),
+	UTF8ENTITY("iacute", 237),
+	UTF8ENTITY("icirc", 238),
+	UTF8ENTITY("iuml", 239),
+	UTF8ENTITY("eth", 240),
+	UTF8ENTITY("ntilde", 241),
+	UTF8ENTITY("ograve", 242),
+	UTF8ENTITY("oacute", 243),
+	UTF8ENTITY("ocirc", 244),
+	UTF8ENTITY("otilde", 245),
+	UTF8ENTITY("ouml", 246),
+	UTF8ENTITY("divide", 247),
+	UTF8ENTITY("oslash", 248),
+	UTF8ENTITY("ugrave", 249),
+	UTF8ENTITY("uacute", 250),
+	UTF8ENTITY("ucirc", 251),
+	UTF8ENTITY("uuml", 252),
+	UTF8ENTITY("yacute", 253),
+	UTF8ENTITY("thorn", 254),
+	UTF8ENTITY("yuml", 255),
+	UTF8ENTITY("fnof", 402),
+	// Greek
+	UTF8ENTITY("Alpha", 913),
+	UTF8ENTITY("Beta", 914),
+	UTF8ENTITY("Gamma", 915),
+	UTF8ENTITY("Delta", 916),
+	UTF8ENTITY("Epsilon", 917),
+	UTF8ENTITY("Zeta", 918),
+	UTF8ENTITY("Eta", 919),
+	UTF8ENTITY("Theta", 920),
+	UTF8ENTITY("Iota", 921),
+	UTF8ENTITY("Kappa", 922),
+	UTF8ENTITY("Lambda", 923),
+	UTF8ENTITY("Mu", 924),
+	UTF8ENTITY("Nu", 925),
+	UTF8ENTITY("Xi", 926),
+	UTF8ENTITY("Omicron", 927),
+	UTF8ENTITY("Pi", 928),
+	UTF8ENTITY("Rho", 929),
+	UTF8ENTITY("Sigma", 931),
+	UTF8ENTITY("Tau", 932),
+	UTF8ENTITY("Upsilon", 933),
+	UTF8ENTITY("Phi", 934),
+	UTF8ENTITY("Chi", 935),
+	UTF8ENTITY("Psi", 936),
+	UTF8ENTITY("Omega", 937),
+	UTF8ENTITY("alpha", 945),
+	UTF8ENTITY("beta", 946),
+	UTF8ENTITY("gamma", 947),
+	UTF8ENTITY("delta", 948),
+	UTF8ENTITY("epsilon", 949),
+	UTF8ENTITY("zeta", 950),
+	UTF8ENTITY("eta", 951),
+	UTF8ENTITY("theta", 952),
+	UTF8ENTITY("iota", 953),
+	UTF8ENTITY("kappa", 954),
+	UTF8ENTITY("lambda", 955),
+	UTF8ENTITY("mu", 956),
+	UTF8ENTITY("nu", 957),
+	UTF8ENTITY("xi", 958),
+	UTF8ENTITY("omicron", 959),
+	UTF8ENTITY("pi", 960),
+	UTF8ENTITY("rho", 961),
+	UTF8ENTITY("sigmaf", 962),
+	UTF8ENTITY("sigma", 963),
+	UTF8ENTITY("tau", 964),
+	UTF8ENTITY("upsilon", 965),
+	UTF8ENTITY("phi", 966),
+	UTF8ENTITY("chi", 967),
+	UTF8ENTITY("psi", 968),
+	UTF8ENTITY("omega", 969),
+	UTF8ENTITY("thetasym", 977),
+	UTF8ENTITY("upsih", 978),
+	UTF8ENTITY("piv", 982),
+	// General Punctuation
+	UTF8ENTITY("bull", 8226),
+	UTF8ENTITY("hellip", 8230),
+	UTF8ENTITY("prime", 8242),
+	UTF8ENTITY("Prime", 8243),
+	UTF8ENTITY("oline", 8254),
+	UTF8ENTITY("frasl", 8260),
+	// Letterlike Symbols
+	UTF8ENTITY("weierp", 8472),
+	UTF8ENTITY("image", 8465),
+	UTF8ENTITY("real", 8476),
+	UTF8ENTITY("trade", 8482),
+	UTF8ENTITY("alefsym", 8501),
+	// Arrows
+	UTF8ENTITY("larr", 8592),
+	UTF8ENTITY("uarr", 8593),
+	UTF8ENTITY("rarr", 8594),
+	UTF8ENTITY("darr", 8595),
+	UTF8ENTITY("harr", 8596),
+	UTF8ENTITY("crarr", 8629),
+	UTF8ENTITY("lArr", 8656),
+	UTF8ENTITY("uArr", 8657),
+	UTF8ENTITY("rArr", 8658),
+	UTF8ENTITY("dArr", 8659),
+	UTF8ENTITY("hArr", 8660),
+	// Mathematical Operators
+	UTF8ENTITY("forall", 8704),
+	UTF8ENTITY("part", 8706),
+	UTF8ENTITY("exist", 8707),
+	UTF8ENTITY("empty", 8709),
+	UTF8ENTITY("nabla", 8711),
+	UTF8ENTITY("isin", 8712),
+	UTF8ENTITY("notin", 8713),
+	UTF8ENTITY("ni", 8715),
+	UTF8ENTITY("prod", 8719),
+	UTF8ENTITY("sum", 8721),
+	UTF8ENTITY("minus", 8722),
+	UTF8ENTITY("lowast", 8727),
+	UTF8ENTITY("radic", 8730),
+	UTF8ENTITY("prop", 8733),
+	UTF8ENTITY("infin", 8734),
+	UTF8ENTITY("ang", 8736),
+	UTF8ENTITY("and", 8743),
+	UTF8ENTITY("or", 8744),
+	UTF8ENTITY("cap", 8745),
+	UTF8ENTITY("cup", 8746),
+	UTF8ENTITY("int", 8747),
+	UTF8ENTITY("there4", 8756),
+	UTF8ENTITY("sim", 8764),
+	UTF8ENTITY("cong", 8773),
+	UTF8ENTITY("asymp", 8776),
+	UTF8ENTITY("ne", 8800),
+	UTF8ENTITY("equiv", 8801),
+	UTF8ENTITY("le", 8804),
+	UTF8ENTITY("ge", 8805),
+	UTF8ENTITY("sub", 8834),
+	UTF8ENTITY("sup", 8835),
+	UTF8ENTITY("nsub", 8836),
+	UTF8ENTITY("sube", 8838),
+	UTF8ENTITY("supe", 8839),
+	UTF8ENTITY("oplus", 8853),
+	UTF8ENTITY("otimes", 8855),
+	UTF8ENTITY("perp", 8869),
+	UTF8ENTITY("sdot", 8901),
+	// Miscellaneous Technical
+	UTF8ENTITY("lceil", 8968),
+	UTF8ENTITY("rceil", 8969),
+	UTF8ENTITY("lfloor", 8970),
+	UTF8ENTITY("rfloor", 8971),
+	UTF8ENTITY("lang", 9001),
+	UTF8ENTITY("rang", 9002),
+	// Geometric Shapes
+	UTF8ENTITY("loz", 9674),
+	// Miscellaneous Symbols
+	UTF8ENTITY("spades", 9824),
+	UTF8ENTITY("clubs", 9827),
+	UTF8ENTITY("hearts", 9829),
+	UTF8ENTITY("diams", 9830),
+	UTF8ENTITY("quot", 34),
+	UTF8ENTITY("amp", 38),
+	UTF8ENTITY("lt", 60),
+	UTF8ENTITY("gt", 62),
+	// Latin Extended-A
+	UTF8ENTITY("OElig", 338),
+	UTF8ENTITY("oelig", 339),
+	UTF8ENTITY("Scaron", 352),
+	UTF8ENTITY("scaron", 353),
+	UTF8ENTITY("Yuml", 376),
+	// Spacing Modifier Letters
+	UTF8ENTITY("circ", 710),
+	UTF8ENTITY("tilde", 732),
+	// General Punctuation
+	UTF8ENTITY("ensp", 8194),
+	UTF8ENTITY("emsp", 8195),
+	UTF8ENTITY("thinsp", 8201),
+	UTF8ENTITY("zwnj", 8204),
+	UTF8ENTITY("zwj", 8205),
+	UTF8ENTITY("lrm", 8206),
+	UTF8ENTITY("rlm", 8207),
+	UTF8ENTITY("ndash", 8211),
+	UTF8ENTITY("mdash", 8212),
+	UTF8ENTITY("lsquo", 8216),
+	UTF8ENTITY("rsquo", 8217),
+	UTF8ENTITY("sbquo", 8218),
+	UTF8ENTITY("ldquo", 8220),
+	UTF8ENTITY("rdquo", 8221),
+	UTF8ENTITY("bdquo", 8222),
+	UTF8ENTITY("dagger", 8224),
+	UTF8ENTITY("Dagger", 8225),
+	UTF8ENTITY("permil", 8240),
+	UTF8ENTITY("lsaquo", 8249),
+	UTF8ENTITY("rsaquo", 8250),
+	UTF8ENTITY("euro", 8364),
+	UTF8ENTITY(NULL, 0) // End of the list: the NULL Name is what the loops over this table stop at
+}; // End of HTMLEntityData
+
+// Lookup table from Unicode code point to HTML entity name (the mapped values are names, not UTF-8 bytes)
+class Char2UTF8Map : std::map<unsigned long int, std::string>
+{
+public:
+	typedef std::map<unsigned long int, std::string> base_map;
+	typedef base_map::iterator iterator;
+	typedef std::pair<unsigned long int, std::string> pair;
+
+	// Fills the map from a table that ends with an entry whose Name is NULL.
+	// If a code point appears more than once the first name inserted is kept.
+	Char2UTF8Map(const HTMLEntityDataType *entities)
+	{
+		// Walk the table that was passed in, not the global HTMLEntityData
+		for (const HTMLEntityDataType *ent = entities; ent && ent->Name; ent++)
+			insert ( pair(ent->Char,ent->Name) );
+	}
+
+	// The base class is inherited privately; only end() and find() are offered to callers
+	inline iterator end() { return base_map::end(); }
+
+	// Iterator to the entry for code point ch, or end() when there is none
+	inline iterator find(unsigned long int ch) { return base_map::find(ch); }
+};
+
+static Char2UTF8Map char2utf8(HTMLEntityData); // Built once during static initialization; read by QuoteHTML()
+
+/**
+ * QuoteHTML:
+ * @out: stream that receives the quoted text
+ * @in: NUL-terminated UTF-8 string
+ *
+ * Copies ASCII characters unchanged (markup characters included) and writes
+ * every other character as a named entity when char2utf8 has one, or as a
+ * decimal reference otherwise. Stops at the first sequence that does not decode.
+ **/
+void QuoteHTML(std::ostream &out, const char *in)
+{
+	int left = strlen(in);
+	for (const char *ptr = in; *ptr; )
+	{
+		unsigned long int ch;
+		int adv = utf8::parser::utf8_to_char(&ch, ptr, left);
+		// A negative adv is an error code; adv == left is simply the last character
+		if (adv <= 0 || adv > left)
+			break;
+		Char2UTF8Map::iterator it;
+		if (ch < 128)
+			out.put(*ptr);
+		else if ((it = char2utf8.find(ch)) != char2utf8.end())
+			out << "&" << it->second << ";";
+		else
+			out << "&#" << ch << ";";
+		left -= adv;
+		ptr += adv;
+	}
+}
+
+
+#ifdef UNIT_TEST
+
+// Packs the UTF-8 encoding of ch into one integer, first byte in the lowest 8 bits.
+// Only handles code points that fit in 16 bits (sequences of up to 3 bytes).
+static uint32_t Ch2UTF8(unsigned int ch)
+{
+	if (ch < 0x80)
+		return ch;
+	const unsigned int tail = 0x80 | (ch & 0x3F);
+	if (ch < 0x800)
+		return (0xC0 | (ch >> 6 & 0x1F)) + tail * 256;
+	const unsigned int lead = 0xE0 | (ch >> 12 & 0x0F);
+	const unsigned int middle = 0x80 | (ch >> 6 & 0x3F);
+	return lead + middle * 256 + tail * 65536;
+}
+
+// Checks that the bytes packed by UTF8ENTITY agree with Ch2UTF8() for every table entry, then
+// prints a sample string through QuoteHTML() for visual inspection (that output is not asserted).
+TEST_FUNCTION TestCuUTF8HTML(CuTest* tc)
+{
+	for (const HTMLEntityDataType *ent = HTMLEntityData; ent->Name; ent++)
+	{
+		// Compare the values: a memcmp over the first 3 bytes of an unsigned long
+		// and of a uint32_t only matches their low bytes on little-endian machines
+		CuAssertTrue(tc, ent->UTF8 == Ch2UTF8(ent->Char) );
+	}
+	QuoteHTML(std::cout, "Hou>stño€n\n");
+	std::cout << std::endl;
+}
+#endif
+
diff --git a/utf8/parser.cpp b/utf8/parser.cpp
new file mode 100644
index 0000000..427c214
--- /dev/null
+++ b/utf8/parser.cpp
@@ -0,0 +1,788 @@
+/* 
+ * helper_utf8.c - Raptor UTF-8 and Unicode support
+ *
+ * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
+ * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ * Copyright (C) 2009  Miriam Ruiz <little_miry at yahoo.es>
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ *   2. GNU General Public License (GPL) V2 or any newer version
+ *   3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ */
+
+#include "../common.h"
+#include "parser.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+namespace utf8 { namespace parser {
+
+/**
+ * char_to_utf8:
+ * @c: Unicode character
+ * @output: UTF-8 string buffer or NULL
+ *
+ * Convert a Unicode character to UTF-8 encoding.
+ *
+ * Based on librdf_unicode_char_to_utf8(). When output is not NULL it must
+ * have room for the whole sequence (up to 6 bytes, no NUL is appended);
+ * when it is NULL only the length is computed and nothing is stored.
+ *
+ * Return value: bytes encoded to output buffer or <0 on failure
+ **/
+int char_to_utf8(unsigned long c, char *output)
+{
+	int size=0;
+
+	if (c < 0x00000080)
+		size=1;
+	else if (c < 0x00000800)
+		size=2;
+	else if (c < 0x00010000)
+		size=3;
+	else if (c < 0x00200000)
+		size=4;
+	else if (c < 0x04000000)
+		size=5;
+	else if (c < 0x80000000)
+		size=6;
+	else
+		return -1;
+
+	if (!output)
+		return size;
+
+	/* Continuation bytes are stored from last to first; after each shift one
+	 * marker bit of the first byte is or-ed in, so case 1 finds c complete */
+	switch(size)
+	{
+		case 6:
+			output[5]=0x80 | (unsigned char)(c & 0x3F);
+			c= c >> 6;
+			c |= 0x4000000;		 /* 0x04 << 24: bit 2 of the first byte */
+			/* FALLTHROUGH */
+		case 5:
+			output[4]=0x80 | (unsigned char)(c & 0x3F);
+			c= c >> 6;
+			c |= 0x200000;		 /* 0x08 << 18: bit 3 of the first byte */
+			/* FALLTHROUGH */
+		case 4:
+			output[3]=0x80 | (unsigned char)(c & 0x3F);
+			c= c >> 6;
+			c |= 0x10000;		 /* 0x10 << 12: bit 4 of the first byte */
+			/* FALLTHROUGH */
+		case 3:
+			output[2]=0x80 | (unsigned char)(c & 0x3F);
+			c= c >> 6;
+			c |= 0x800;			 /* 0x20 << 6: bit 5 of the first byte */
+			/* FALLTHROUGH */
+		case 2:
+			output[1]=0x80 | (unsigned char)(c & 0x3F);
+			c= c >> 6;
+			c |= 0xc0;			 /* bits 7 and 6 of the first byte */
+			/* FALLTHROUGH */
+		case 1:
+			output[0]=(unsigned char)c;
+	}
+
+	return size;
+}
+
+
+/**
+ * utf8_to_char:
+ * @output: Pointer to the Unicode character or NULL
+ * @input: UTF-8 string buffer
+ * @length: buffer size
+ *
+ * Convert an UTF-8 encoded buffer to a Unicode character.
+ *
+ * If output is NULL, then will calculate the number of bytes that
+ * will be used from the input buffer and not perform the conversion
+ * (in that case only the first byte is examined).
+ *
+ * The 5 and 6 byte forms of the original UTF-8 definition are parsed but
+ * never succeed: they are either overlong (-2) or out of range (-4).
+ *
+ * Return value: bytes used from input buffer or <0 on failure:
+ *  -1 input buffer too short, length error, or a byte that cannot be
+ *     where it is (invalid first byte or invalid continuation byte)
+ *  -2 overlong UTF-8 sequence
+ *  -3 illegal code positions
+ *  -4 code out of range U+0000 to U+10FFFF.
+ *  In cases -2, -3 and -4 the coded character is stored in the output.
+ */
+int utf8_to_char(unsigned long *output, const char *input, int length)
+{
+	unsigned char in;
+	int size;
+	int i;
+	unsigned long c=0;
+
+	if(length < 1)
+		return -1;
+
+	/* the first byte gives the sequence length and the topmost bits */
+	in=*input++;
+	if((in & 0x80) == 0)
+	{
+		size=1;
+		c= in & 0x7f;
+	}
+	else if((in & 0xe0) == 0xc0)
+	{
+		size=2;
+		c= in & 0x1f;
+	}
+	else if((in & 0xf0) == 0xe0)
+	{
+		size=3;
+		c= in & 0x0f;
+	}
+	else if((in & 0xf8) == 0xf0)
+	{
+		size=4;
+		c = in & 0x07;
+	}
+	else if((in & 0xfc) == 0xf8)
+	{
+		size=5;
+		c = in & 0x03;
+	}
+	else if((in & 0xfe) == 0xfc)
+	{
+		size=6;
+		c = in & 0x01;
+	}
+	else
+		return -1;
+
+	/* length-only query: the rest of the sequence is not looked at */
+	if(!output)
+		return size;
+
+	/* the whole sequence has to be inside the buffer */
+	if(length < size)
+		return -1;
+
+	/* every further byte must be a continuation byte (10xxxxxx); its low
+	 * six bits are appended to the character */
+	for(i=1; i < size; i++)
+	{
+		in=*input++;
+		if((in & 0xc0) != 0x80)
+			return -1;
+		c= (c << 6) | (in & 0x3f);
+	}
+
+	/* stored before validation so that -2, -3 and -4 still report it */
+	*output=c;
+
+	/* check for overlong UTF-8 sequences */
+	switch(size)
+	{
+		case 2:
+			if(c < 0x00000080)
+				return -2;
+			break;
+		case 3:
+			if(c < 0x00000800)
+				return -2;
+			break;
+		case 4:
+			if(c < 0x00010000)
+				return -2;
+			break;
+		/* 5 and 6 byte sequences that are not overlong are rejected by
+		 * the range check further down */
+		case 5:
+			if(c < 0x00200000)
+				return -2;
+			break;
+		case 6:
+			if(c < 0x04000000)
+				return -2;
+			break;
+
+		default: // 1
+			break;
+	}
+
+	/* check for illegal code positions:
+	 * U+D800 to U+DFFF (UTF-16 surrogates)
+	 * U+FFFE and U+FFFF
+	 */
+	if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
+		return -3;
+
+	/* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
+	/* of course this makes some 4 byte forms illegal */
+	if(c > 0x10ffff)
+		return -4;
+
+	return size;
+}
+
+
+/**
+ * is_xml11_namestartchar:
+ * @c: Unicode character to check
+ *
+ * Tells whether @c may start an XML 1.1 Name: the NameStartChar set of
+ *   http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
+ * with the ':' left out.
+ *
+ * Return value: non-0 if legal
+ **/
+int is_xml11_namestartchar(long c)
+{
+	/* inclusive [first, last] ranges of the accepted characters */
+	static const long ranges[][2] = {
+		{ 0x0041,  0x005A  },	/* [A-Z] */
+		{ 0x005F,  0x005F  },	/* '_' */
+		{ 0x0061,  0x007A  },	/* [a-z] */
+		{ 0x00C0,  0x00D6  },
+		{ 0x00D8,  0x00F6  },
+		{ 0x00F8,  0x02FF  },
+		{ 0x0370,  0x037D  },
+		{ 0x037F,  0x1FFF  },
+		{ 0x200C,  0x200D  },
+		{ 0x2070,  0x218F  },
+		{ 0x2C00,  0x2FEF  },
+		{ 0x3001,  0xD7FF  },
+		{ 0xF900,  0xFDCF  },
+		{ 0xFDF0,  0xFFFD  },
+		{ 0x10000, 0xEFFFF }
+	};
+	for (unsigned int i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
+		if (c >= ranges[i][0] && c <= ranges[i][1])
+			return 1;
+	return 0;
+}
+
+
+/**
+ * is_xml10_namestartchar:
+ * @c: Unicode character to check
+ *
+ * Tells whether @c may start an XML 1.0 Name that has no colon in it:
+ * it has to be a Letter (see is_letter()) or the underscore.
+ *
+ * Namespaces in XML REC 1999-01-14
+ *   http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
+ * updating
+ *   Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
+ *   http://www.w3.org/TR/2004/REC-xml-20040204/
+ *
+ * Return value: non-0 if legal
+ **/
+int is_xml10_namestartchar(long c)
+{
+	/* '_' is the only accepted character that is not a letter */
+	return (c == 0x005F || is_letter(c)) ? 1 : 0;
+}
+
+
+/**
+ * is_xml11_namechar:
+ * @c: Unicode character
+ *
+ * Tells whether @c may appear after the first character of an XML 1.1
+ * Name. The ':' is not accepted.
+ *
+ * Namespaces in XML 1.1 REC 2004-02-04
+ *   http://www.w3.org/TR/2004/REC-xml11-20040204/
+ * updating
+ *   Extensible Markup Language (XML) 1.1 REC 2004-02-04
+ *   http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
+ *
+ * Return value: non-0 if legal
+ **/
+int is_xml11_namechar(long c)
+{
+	if (c == 0x002D || c == 0x002E || c == 0x00B7)	/* '-', '.' and U+00B7 */
+		return 1;
+	if (c >= 0x0030 && c <= 0x0039)			/* 0-9 */
+		return 1;
+	if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x203F && c <= 0x2040))
+		return 1;
+	/* whatever can start a name can also continue it */
+	return is_xml11_namestartchar(c) ? 1 : 0;
+}
+
+
+/**
+ * is_xml10_namechar:
+ * @c: Unicode character
+ *
+ * Tells whether @c may appear after the first character of an XML 1.0
+ * Name that has no colon in it.
+ *
+ * Namespaces in XML REC 1999-01-14
+ *   http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar
+ * updating
+ *   Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
+ *   http://www.w3.org/TR/2004/REC-xml-20040204/
+ *
+ * Return value: non-0 if legal
+ **/
+int is_xml10_namechar(long c)
+{
+	/* the three punctuation marks: '.', '-' and '_' */
+	if (c == 0x002E || c == 0x002D || c == 0x005F)
+		return 1;
+	/* then the character classes, base classes first */
+	if (is_letter(c) || is_digit(c))
+		return 1;
+	return (is_combiningchar(c) || is_extender(c)) ? 1 : 0;
+}
+
+
+/*
+ * All this below was derived by machine-transforming the classes in Appendix B
+ * of http://www.w3.org/TR/2000/REC-xml-20001006
+ */
+
+int is_letter(long c)
+{
+	/* a letter is anything in the Ideographic or in the BaseChar class */
+	return (is_ideographic(c) || is_basechar(c)) ? 1 : 0;
+}
+
+
+int is_basechar(long c)
+{
+	/* Returns non-0 if c is in one of the BaseChar ranges of http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */
+	return((c >= 0x0041 && c <= 0x005A ) ||
+		(c >= 0x0061 && c <= 0x007A ) ||
+		(c >= 0x00C0 && c <= 0x00D6 ) ||
+		(c >= 0x00D8 && c <= 0x00F6 ) ||
+		(c >= 0x00F8 && c <= 0x00FF ) ||
+		(c >= 0x0100 && c <= 0x0131 ) ||
+		(c >= 0x0134 && c <= 0x013E ) ||
+		(c >= 0x0141 && c <= 0x0148 ) ||
+		(c >= 0x014A && c <= 0x017E ) ||
+		(c >= 0x0180 && c <= 0x01C3 ) ||
+		(c >= 0x01CD && c <= 0x01F0 ) ||
+		(c >= 0x01F4 && c <= 0x01F5 ) ||
+		(c >= 0x01FA && c <= 0x0217 ) ||
+		(c >= 0x0250 && c <= 0x02A8 ) ||
+		(c >= 0x02BB && c <= 0x02C1 ) ||
+		(c == 0x0386) ||
+		(c >= 0x0388 && c <= 0x038A ) ||
+		(c == 0x038C) ||
+		(c >= 0x038E && c <= 0x03A1 ) ||
+		(c >= 0x03A3 && c <= 0x03CE ) ||
+		(c >= 0x03D0 && c <= 0x03D6 ) ||
+		(c == 0x03DA) ||
+		(c == 0x03DC) ||
+		(c == 0x03DE) ||
+		(c == 0x03E0) ||
+		(c >= 0x03E2 && c <= 0x03F3 ) ||
+		(c >= 0x0401 && c <= 0x040C ) ||
+		(c >= 0x040E && c <= 0x044F ) ||
+		(c >= 0x0451 && c <= 0x045C ) ||
+		(c >= 0x045E && c <= 0x0481 ) ||
+		(c >= 0x0490 && c <= 0x04C4 ) ||
+		(c >= 0x04C7 && c <= 0x04C8 ) ||
+		(c >= 0x04CB && c <= 0x04CC ) ||
+		(c >= 0x04D0 && c <= 0x04EB ) ||
+		(c >= 0x04EE && c <= 0x04F5 ) ||
+		(c >= 0x04F8 && c <= 0x04F9 ) ||
+		(c >= 0x0531 && c <= 0x0556 ) ||
+		(c == 0x0559) ||
+		(c >= 0x0561 && c <= 0x0586 ) ||
+		(c >= 0x05D0 && c <= 0x05EA ) ||
+		(c >= 0x05F0 && c <= 0x05F2 ) ||
+		(c >= 0x0621 && c <= 0x063A ) ||
+		(c >= 0x0641 && c <= 0x064A ) ||
+		(c >= 0x0671 && c <= 0x06B7 ) ||
+		(c >= 0x06BA && c <= 0x06BE ) ||
+		(c >= 0x06C0 && c <= 0x06CE ) ||
+		(c >= 0x06D0 && c <= 0x06D3 ) ||
+		(c == 0x06D5) ||
+		(c >= 0x06E5 && c <= 0x06E6 ) ||
+		(c >= 0x0905 && c <= 0x0939 ) ||
+		(c == 0x093D) ||
+		(c >= 0x0958 && c <= 0x0961 ) ||
+		(c >= 0x0985 && c <= 0x098C ) ||
+		(c >= 0x098F && c <= 0x0990 ) ||
+		(c >= 0x0993 && c <= 0x09A8 ) ||
+		(c >= 0x09AA && c <= 0x09B0 ) ||
+		(c == 0x09B2) ||
+		(c >= 0x09B6 && c <= 0x09B9 ) ||
+		(c >= 0x09DC && c <= 0x09DD ) ||
+		(c >= 0x09DF && c <= 0x09E1 ) ||
+		(c >= 0x09F0 && c <= 0x09F1 ) ||
+		(c >= 0x0A05 && c <= 0x0A0A ) ||
+		(c >= 0x0A0F && c <= 0x0A10 ) ||
+		(c >= 0x0A13 && c <= 0x0A28 ) ||
+		(c >= 0x0A2A && c <= 0x0A30 ) ||
+		(c >= 0x0A32 && c <= 0x0A33 ) ||
+		(c >= 0x0A35 && c <= 0x0A36 ) ||
+		(c >= 0x0A38 && c <= 0x0A39 ) ||
+		(c >= 0x0A59 && c <= 0x0A5C ) ||
+		(c == 0x0A5E) ||
+		(c >= 0x0A72 && c <= 0x0A74 ) ||
+		(c >= 0x0A85 && c <= 0x0A8B ) ||
+		(c == 0x0A8D) ||
+		(c >= 0x0A8F && c <= 0x0A91 ) ||
+		(c >= 0x0A93 && c <= 0x0AA8 ) ||
+		(c >= 0x0AAA && c <= 0x0AB0 ) ||
+		(c >= 0x0AB2 && c <= 0x0AB3 ) ||
+		(c >= 0x0AB5 && c <= 0x0AB9 ) ||
+		(c == 0x0ABD) ||
+		(c == 0x0AE0) ||
+		(c >= 0x0B05 && c <= 0x0B0C ) ||
+		(c >= 0x0B0F && c <= 0x0B10 ) ||
+		(c >= 0x0B13 && c <= 0x0B28 ) ||
+		(c >= 0x0B2A && c <= 0x0B30 ) ||
+		(c >= 0x0B32 && c <= 0x0B33 ) ||
+		(c >= 0x0B36 && c <= 0x0B39 ) ||
+		(c == 0x0B3D) ||
+		(c >= 0x0B5C && c <= 0x0B5D ) ||
+		(c >= 0x0B5F && c <= 0x0B61 ) ||
+		(c >= 0x0B85 && c <= 0x0B8A ) ||
+		(c >= 0x0B8E && c <= 0x0B90 ) ||
+		(c >= 0x0B92 && c <= 0x0B95 ) ||
+		(c >= 0x0B99 && c <= 0x0B9A ) ||
+		(c == 0x0B9C) ||
+		(c >= 0x0B9E && c <= 0x0B9F ) ||
+		(c >= 0x0BA3 && c <= 0x0BA4 ) ||
+		(c >= 0x0BA8 && c <= 0x0BAA ) ||
+		(c >= 0x0BAE && c <= 0x0BB5 ) ||
+		(c >= 0x0BB7 && c <= 0x0BB9 ) ||
+		(c >= 0x0C05 && c <= 0x0C0C ) ||
+		(c >= 0x0C0E && c <= 0x0C10 ) ||
+		(c >= 0x0C12 && c <= 0x0C28 ) ||
+		(c >= 0x0C2A && c <= 0x0C33 ) ||
+		(c >= 0x0C35 && c <= 0x0C39 ) ||
+		(c >= 0x0C60 && c <= 0x0C61 ) ||
+		(c >= 0x0C85 && c <= 0x0C8C ) ||
+		(c >= 0x0C8E && c <= 0x0C90 ) ||
+		(c >= 0x0C92 && c <= 0x0CA8 ) ||
+		(c >= 0x0CAA && c <= 0x0CB3 ) ||
+		(c >= 0x0CB5 && c <= 0x0CB9 ) ||
+		(c == 0x0CDE) ||
+		(c >= 0x0CE0 && c <= 0x0CE1 ) ||
+		(c >= 0x0D05 && c <= 0x0D0C ) ||
+		(c >= 0x0D0E && c <= 0x0D10 ) ||
+		(c >= 0x0D12 && c <= 0x0D28 ) ||
+		(c >= 0x0D2A && c <= 0x0D39 ) ||
+		(c >= 0x0D60 && c <= 0x0D61 ) ||
+		(c >= 0x0E01 && c <= 0x0E2E ) ||
+		(c == 0x0E30) ||
+		(c >= 0x0E32 && c <= 0x0E33 ) ||
+		(c >= 0x0E40 && c <= 0x0E45 ) ||
+		(c >= 0x0E81 && c <= 0x0E82 ) ||
+		(c == 0x0E84) ||
+		(c >= 0x0E87 && c <= 0x0E88 ) ||
+		(c == 0x0E8A) ||
+		(c == 0x0E8D) ||
+		(c >= 0x0E94 && c <= 0x0E97 ) ||
+		(c >= 0x0E99 && c <= 0x0E9F ) ||
+		(c >= 0x0EA1 && c <= 0x0EA3 ) ||
+		(c == 0x0EA5) ||
+		(c == 0x0EA7) ||
+		(c >= 0x0EAA && c <= 0x0EAB ) ||
+		(c >= 0x0EAD && c <= 0x0EAE ) ||
+		(c == 0x0EB0) ||
+		(c >= 0x0EB2 && c <= 0x0EB3 ) ||
+		(c == 0x0EBD) ||
+		(c >= 0x0EC0 && c <= 0x0EC4 ) ||
+		(c >= 0x0F40 && c <= 0x0F47 ) ||
+		(c >= 0x0F49 && c <= 0x0F69 ) ||
+		(c >= 0x10A0 && c <= 0x10C5 ) ||
+		(c >= 0x10D0 && c <= 0x10F6 ) ||
+		(c == 0x1100) ||
+		(c >= 0x1102 && c <= 0x1103 ) ||
+		(c >= 0x1105 && c <= 0x1107 ) ||
+		(c == 0x1109) ||
+		(c >= 0x110B && c <= 0x110C ) ||
+		(c >= 0x110E && c <= 0x1112 ) ||
+		(c == 0x113C) ||
+		(c == 0x113E) ||
+		(c == 0x1140) ||
+		(c == 0x114C) ||
+		(c == 0x114E) ||
+		(c == 0x1150) ||
+		(c >= 0x1154 && c <= 0x1155 ) ||
+		(c == 0x1159) ||
+		(c >= 0x115F && c <= 0x1161 ) ||
+		(c == 0x1163) ||
+		(c == 0x1165) ||
+		(c == 0x1167) ||
+		(c == 0x1169) ||
+		(c >= 0x116D && c <= 0x116E ) ||
+		(c >= 0x1172 && c <= 0x1173 ) ||
+		(c == 0x1175) ||
+		(c == 0x119E) ||
+		(c == 0x11A8) ||
+		(c == 0x11AB) ||
+		(c >= 0x11AE && c <= 0x11AF ) ||
+		(c >= 0x11B7 && c <= 0x11B8 ) ||
+		(c == 0x11BA) ||
+		(c >= 0x11BC && c <= 0x11C2 ) ||
+		(c == 0x11EB) ||
+		(c == 0x11F0) ||
+		(c == 0x11F9) ||
+		(c >= 0x1E00 && c <= 0x1E9B ) ||
+		(c >= 0x1EA0 && c <= 0x1EF9 ) ||
+		(c >= 0x1F00 && c <= 0x1F15 ) ||
+		(c >= 0x1F18 && c <= 0x1F1D ) ||
+		(c >= 0x1F20 && c <= 0x1F45 ) ||
+		(c >= 0x1F48 && c <= 0x1F4D ) ||
+		(c >= 0x1F50 && c <= 0x1F57 ) ||
+		(c == 0x1F59) ||
+		(c == 0x1F5B) ||
+		(c == 0x1F5D) ||
+		(c >= 0x1F5F && c <= 0x1F7D ) ||
+		(c >= 0x1F80 && c <= 0x1FB4 ) ||
+		(c >= 0x1FB6 && c <= 0x1FBC ) ||
+		(c == 0x1FBE) ||
+		(c >= 0x1FC2 && c <= 0x1FC4 ) ||
+		(c >= 0x1FC6 && c <= 0x1FCC ) ||
+		(c >= 0x1FD0 && c <= 0x1FD3 ) ||
+		(c >= 0x1FD6 && c <= 0x1FDB ) ||
+		(c >= 0x1FE0 && c <= 0x1FEC ) ||
+		(c >= 0x1FF2 && c <= 0x1FF4 ) ||
+		(c >= 0x1FF6 && c <= 0x1FFC ) ||
+		(c == 0x2126) ||
+		(c >= 0x212A && c <= 0x212B ) ||
+		(c == 0x212E) ||
+		(c >= 0x2180 && c <= 0x2182 ) ||
+		(c >= 0x3041 && c <= 0x3094 ) ||
+		(c >= 0x30A1 && c <= 0x30FA ) ||
+		(c >= 0x3105 && c <= 0x312C ) ||
+		(c >= 0xAC00 && c <= 0xD7A3 )
+		);
+}
+
+
+int is_ideographic(long c)
+{
+	/* Ideographic class: http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic */
+	if (c == 0x3007)
+		return 1;
+	return ((c >= 0x3021 && c <= 0x3029) || (c >= 0x4E00 && c <= 0x9FA5)) ? 1 : 0;
+}
+
+
+int is_combiningchar(long c)
+{
+	/* Returns non-0 if c is in one of the CombiningChar ranges of http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar */
+	return((c >= 0x0300 && c <= 0x0345 ) ||
+		(c >= 0x0360 && c <= 0x0361 ) ||
+		(c >= 0x0483 && c <= 0x0486 ) ||
+		(c >= 0x0591 && c <= 0x05A1 ) ||
+		(c >= 0x05A3 && c <= 0x05B9 ) ||
+		(c >= 0x05BB && c <= 0x05BD ) ||
+		(c == 0x05BF) ||
+		(c >= 0x05C1 && c <= 0x05C2 ) ||
+		(c == 0x05C4) ||
+		(c >= 0x064B && c <= 0x0652 ) ||
+		(c == 0x0670) ||
+		(c >= 0x06D6 && c <= 0x06DC ) ||
+		(c >= 0x06DD && c <= 0x06DF ) ||
+		(c >= 0x06E0 && c <= 0x06E4 ) ||
+		(c >= 0x06E7 && c <= 0x06E8 ) ||
+		(c >= 0x06EA && c <= 0x06ED ) ||
+		(c >= 0x0901 && c <= 0x0903 ) ||
+		(c == 0x093C) ||
+		(c >= 0x093E && c <= 0x094C ) ||
+		(c == 0x094D) ||
+		(c >= 0x0951 && c <= 0x0954 ) ||
+		(c >= 0x0962 && c <= 0x0963 ) ||
+		(c >= 0x0981 && c <= 0x0983 ) ||
+		(c == 0x09BC) ||
+		(c == 0x09BE) ||
+		(c == 0x09BF) ||
+		(c >= 0x09C0 && c <= 0x09C4 ) ||
+		(c >= 0x09C7 && c <= 0x09C8 ) ||
+		(c >= 0x09CB && c <= 0x09CD ) ||
+		(c == 0x09D7) ||
+		(c >= 0x09E2 && c <= 0x09E3 ) ||
+		(c == 0x0A02) ||
+		(c == 0x0A3C) ||
+		(c == 0x0A3E) ||
+		(c == 0x0A3F) ||
+		(c >= 0x0A40 && c <= 0x0A42 ) ||
+		(c >= 0x0A47 && c <= 0x0A48 ) ||
+		(c >= 0x0A4B && c <= 0x0A4D ) ||
+		(c >= 0x0A70 && c <= 0x0A71 ) ||
+		(c >= 0x0A81 && c <= 0x0A83 ) ||
+		(c == 0x0ABC) ||
+		(c >= 0x0ABE && c <= 0x0AC5 ) ||
+		(c >= 0x0AC7 && c <= 0x0AC9 ) ||
+		(c >= 0x0ACB && c <= 0x0ACD ) ||
+		(c >= 0x0B01 && c <= 0x0B03 ) ||
+		(c == 0x0B3C) ||
+		(c >= 0x0B3E && c <= 0x0B43 ) ||
+		(c >= 0x0B47 && c <= 0x0B48 ) ||
+		(c >= 0x0B4B && c <= 0x0B4D ) ||
+		(c >= 0x0B56 && c <= 0x0B57 ) ||
+		(c >= 0x0B82 && c <= 0x0B83 ) ||
+		(c >= 0x0BBE && c <= 0x0BC2 ) ||
+		(c >= 0x0BC6 && c <= 0x0BC8 ) ||
+		(c >= 0x0BCA && c <= 0x0BCD ) ||
+		(c == 0x0BD7) ||
+		(c >= 0x0C01 && c <= 0x0C03 ) ||
+		(c >= 0x0C3E && c <= 0x0C44 ) ||
+		(c >= 0x0C46 && c <= 0x0C48 ) ||
+		(c >= 0x0C4A && c <= 0x0C4D ) ||
+		(c >= 0x0C55 && c <= 0x0C56 ) ||
+		(c >= 0x0C82 && c <= 0x0C83 ) ||
+		(c >= 0x0CBE && c <= 0x0CC4 ) ||
+		(c >= 0x0CC6 && c <= 0x0CC8 ) ||
+		(c >= 0x0CCA && c <= 0x0CCD ) ||
+		(c >= 0x0CD5 && c <= 0x0CD6 ) ||
+		(c >= 0x0D02 && c <= 0x0D03 ) ||
+		(c >= 0x0D3E && c <= 0x0D43 ) ||
+		(c >= 0x0D46 && c <= 0x0D48 ) ||
+		(c >= 0x0D4A && c <= 0x0D4D ) ||
+		(c == 0x0D57) ||
+		(c == 0x0E31) ||
+		(c >= 0x0E34 && c <= 0x0E3A ) ||
+		(c >= 0x0E47 && c <= 0x0E4E ) ||
+		(c == 0x0EB1) ||
+		(c >= 0x0EB4 && c <= 0x0EB9 ) ||
+		(c >= 0x0EBB && c <= 0x0EBC ) ||
+		(c >= 0x0EC8 && c <= 0x0ECD ) ||
+		(c >= 0x0F18 && c <= 0x0F19 ) ||
+		(c == 0x0F35) ||
+		(c == 0x0F37) ||
+		(c == 0x0F39) ||
+		(c == 0x0F3E) ||
+		(c == 0x0F3F) ||
+		(c >= 0x0F71 && c <= 0x0F84 ) ||
+		(c >= 0x0F86 && c <= 0x0F8B ) ||
+		(c >= 0x0F90 && c <= 0x0F95 ) ||
+		(c == 0x0F97) ||
+		(c >= 0x0F99 && c <= 0x0FAD ) ||
+		(c >= 0x0FB1 && c <= 0x0FB7 ) ||
+		(c == 0x0FB9) ||
+		(c >= 0x20D0 && c <= 0x20DC ) ||
+		(c == 0x20E1) ||
+		(c >= 0x302A && c <= 0x302F ) ||
+		(c == 0x3099) ||
+		(c == 0x309A));
+}
+
+
+int is_digit(long c)
+{
+	/* Digit class: http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */
+	static const long ranges[][2] = {
+		{ 0x0030, 0x0039 }, { 0x0660, 0x0669 },
+		{ 0x06F0, 0x06F9 }, { 0x0966, 0x096F },
+		{ 0x09E6, 0x09EF }, { 0x0A66, 0x0A6F },
+		{ 0x0AE6, 0x0AEF }, { 0x0B66, 0x0B6F },
+		{ 0x0BE7, 0x0BEF }, { 0x0C66, 0x0C6F },
+		{ 0x0CE6, 0x0CEF }, { 0x0D66, 0x0D6F },
+		{ 0x0E50, 0x0E59 }, { 0x0ED0, 0x0ED9 },
+		{ 0x0F20, 0x0F29 }
+	};
+
+	for (unsigned int i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
+		if (c >= ranges[i][0] && c <= ranges[i][1])
+			return 1;
+	return 0;
+}
+
+
+int is_extender(long c)
+{
+	/* Extender class: http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender */
+	switch (c)
+	{
+		case 0x00B7: case 0x02D0: case 0x02D1: case 0x0387:
+		case 0x0640: case 0x0E46: case 0x0EC6: case 0x3005:
+			return 1;
+		default:
+			break;
+	}
+	return ((c >= 0x3031 && c <= 0x3035) ||
+		(c >= 0x309D && c <= 0x309E) ||
+		(c >= 0x30FC && c <= 0x30FE)) ? 1 : 0;
+}
+
+
+int is_space(long c)
+{
+	/* the space itself, or one of \t \n \v \f \r, which are the consecutive codes 0x09 to 0x0D */
+	if (c == 0x0020)
+		return 1;
+	if (c >= 0x0009 && c <= 0x000D)
+		return 1;
+	return 0;
+}
+
+
+/**
+ * is_nfc:
+ * @input: UTF-8 string
+ * @length: length of string
+ *
+ * Check a string is in Unicode Normal Form C. A string with no byte
+ * above 0x7f is accepted at once; any other string is passed to
+ * helper_nfc_check() when helper_NFC_CHECK is defined, else accepted too.
+ *
+ * Return value: Non 0 if the string is NFC
+ **/
+int is_nfc(const char *input, size_t length)
+{
+	size_t i;
+
+	/* plain char may be signed: the byte has to be compared as unsigned */
+	for(i=0; i<length; i++)
+		if((unsigned char)input[i]>0x7f)
+			break;
+
+	/* the loop ran to the end: no byte above 0x7f was found */
+	if(i == length)
+		return 1;
+
+	#ifdef helper_NFC_CHECK
+	return helper_nfc_check(input, length, NULL);
+	#else
+	return 1;
+	#endif
+}
+
+
+/**
+ * check:
+ * @string: UTF-8 string
+ * @length: length of string
+ *
+ * Check a string is UTF-8, decoding it with utf8_to_char().
+ *
+ * Return value: Non 0 if the string is UTF-8
+ **/
+int check(const char *string, size_t length)
+{
+	while(length > 0)
+	{
+		unsigned long unichar=0;
+		/* No sequence is longer than 6 bytes, so the size handed over can
+		 * be capped; a plain cast would go wrong for lengths above INT_MAX */
+		int avail = (length > 6) ? 6 : (int)length;
+
+		int unichar_len = utf8_to_char(&unichar, string, avail);
+		if(unichar_len < 0 || unichar_len > avail)
+			return 0;
+
+		string += unichar_len;
+		length -= unichar_len;
+	}
+	return 1;
+}
+
+} } // Close namespaces
diff --git a/utf8/parser.h b/utf8/parser.h
new file mode 100644
index 0000000..19d0153
--- /dev/null
+++ b/utf8/parser.h
@@ -0,0 +1,51 @@
+/* 
+ * utf8/parser.h - Raptor UTF-8 and Unicode support
+ *
+ * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
+ * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ * Copyright (C) 2009  Miriam Ruiz <little_miry at yahoo.es>
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ *   2. GNU General Public License (GPL) V2 or any newer version
+ *   3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ */
+
+#ifndef GOPLAY_UTF8_PARSER_H
+#define GOPLAY_UTF8_PARSER_H
+
+#include <cstdlib>
+
+namespace utf8
+{
+	namespace parser
+	{
+		// Single code point <-> UTF-8 bytes (return conventions: see the definitions)
+		int char_to_utf8(unsigned long c, char *output);
+		int utf8_to_char(unsigned long *output, const char *input, int length);
+		// Code point class tests (is_extender follows the XML Extender production)
+		int is_letter(long c);
+		int is_basechar(long c);
+		int is_ideographic(long c);
+		int is_combiningchar(long c);
+		int is_digit(long c);
+		int is_extender(long c);
+		// Non 0 only for space, \t, \n, \v, \f and \r
+		int is_space(long c);
+		// NOTE(review): presumably the XML 1.0 / 1.1 name character tests - confirm
+		int is_xml11_namestartchar(long c);
+		int is_xml10_namestartchar(long c);
+		int is_xml11_namechar(long c);
+		int is_xml10_namechar(long c);
+		// Whole-string checks over 'length' bytes; non 0 means the string is accepted
+		int is_nfc(const char *input, size_t length);
+		int check(const char *string, size_t length);
+
+	}							 // namespace utf8::parser
+}								 // namespace utf8
+#endif							 // GOPLAY_UTF8_PARSER_H
diff --git a/utf8/testutf8.cpp b/utf8/testutf8.cpp
index 6197bf5..25f7ba0 100644
--- a/utf8/testutf8.cpp
+++ b/utf8/testutf8.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C)  2009  Miriam Ruiz <little_miry at yahoo.es>
- * Copyright (C)  2006 Nemanja Trifunovic
+ * Copyright (C) 2009  Miriam Ruiz <little_miry at yahoo.es>
+ * Copyright (C) 2006  Nemanja Trifunovic
  *
  * http://utfcpp.sourceforge.net/
  * http://sourceforge.net/projects/utfcpp
@@ -74,15 +74,6 @@ TEST_FUNCTION TestCuUTF8CPP(CuTest* tc)
 		CuAssertTrue(tc, w == twochars);
 	}
 
-	// utf8::previous (deprecated)
-	{
-		const char *twochars = "\xe6\x97\xa5\xd1\x88";
-		const char* w = twochars + 3;
-		int cp = utf8::previous (w, twochars - 1);
-		CuAssertTrue(tc, cp == 0x65e5);
-		CuAssertTrue(tc, w == twochars);
-	}
-
 	// utf8::advance
 	{
 		const char *twochars = "\xe6\x97\xa5\xd1\x88";
@@ -223,15 +214,6 @@ TEST_FUNCTION TestCuUTF8CPP(CuTest* tc)
 		CuAssertTrue(tc, w == twochars);
 	}
 
-	// utf8::unchecked::previous (deprecated)
-	{
-		const char *twochars = "\xe6\x97\xa5\xd1\x88";
-		const char *w = twochars + 3;
-		int cp = utf8::unchecked::previous (w);
-		CuAssertTrue(tc, cp == 0x65e5);
-		CuAssertTrue(tc, w == twochars);
-	}
-
 	// utf8::unchecked::advance
 	{
 		const char *twochars = "\xe6\x97\xa5\xd1\x88";
diff --git a/utf8/unchecked.h b/utf8/unchecked.h
index 918e6fe..882960c 100644
--- a/utf8/unchecked.h
+++ b/utf8/unchecked.h
@@ -105,13 +105,6 @@ namespace utf8
             return next(temp);
         }
 
-        // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
-        template <typename octet_iterator>
-        inline uint32_t previous(octet_iterator& it)
-        {
-            return prior(it);
-        }
-
         template <typename octet_iterator, typename distance_type>
         void advance (octet_iterator& it, distance_type n)
         {
diff --git a/utf8/utf8cpp.html b/utf8/utf8cpp.html
index 4ad7e10..a7bb0c0 100644
--- a/utf8/utf8cpp.html
+++ b/utf8/utf8cpp.html
@@ -295,7 +295,7 @@ assert (w == twochars + <span class="literal">3</span>);
       This function is typically used to iterate through a UTF-8 encoded string.
     </p>
     <p>
-      In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
+      In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
       thrown.
     </p>
     <h4>
@@ -335,7 +335,7 @@ assert (cp == <span class="literal">0x65e5</span>);
 assert (w == twochars);
 </pre>
     <p>
-      In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
+      In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
       thrown.
     </p>
     <h4>
@@ -345,7 +345,7 @@ assert (w == twochars);
     Available in version 1.02 and later.
     </p>
     <p>
-      Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+      Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
       decreases the iterator until it hits the beginning of the previous UTF-8 encoded
       code point and returns the 32 bits representation of the code point.
     </p>
@@ -403,7 +403,7 @@ assert (w == twochars);
     Deprecated in version 1.02 and later.
     </p>
     <p>
-      Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+      Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
       decreases the iterator until it hits the beginning of the previous UTF-8 encoded
       code point and returns the 32 bits representation of the code point.
     </p>
@@ -508,7 +508,7 @@ assert (w == twochars + <span class="literal">5</span>);
     Available in version 1.0 and later.
     </p>
     <p>
-      Given the iterators to two UTF-8 encoded code points in a seqence, returns the
+      Given the iterators to two UTF-8 encoded code points in a sequence, returns the
       number of code points between them.
     </p>
 <pre>
@@ -543,8 +543,8 @@ assert (dist == <span class="literal">2</span>);
       it looked better to model it after <code>std::distance</code> algorithm.
     </p>
     <p>
-      In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
-      thrown. If <code>last</code> does not point to the past-of-end of a UTF-8 seqence,
+      In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
+      thrown. If <code>last</code> does not point to the past-of-end of a UTF-8 sequence,
       a <code>utf8::not_enough_room</code> exception is thrown.
     </p>
     <h4>
@@ -630,8 +630,8 @@ assert (utf16result[<span class="literal">3</span>] == <span class=
 "literal">0xdd1e</span>);
 </pre>
     <p>
-      In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
-      thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 seqence, a
+      In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
+      thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a
       <code>utf8::not_enough_room</code> exception is thrown.
     </p>
     <h4>
@@ -713,8 +713,8 @@ utf8to32(twochars, twochars + <span class=
 assert (utf32result.size() == <span class="literal">2</span>);
 </pre>
     <p>
-      In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
-      thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 seqence, a
+      In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
+      thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a
       <code>utf8::not_enough_room</code> exception is thrown.
     </p>
     <h4>
@@ -789,7 +789,7 @@ assert (bvalid == false);
 </pre>
     <p>
       <code>is_valid</code> is a shorthand for <code>find_invalid(start, end) ==
-      end;</code>. You may want to use it to make sure that a byte seqence is a valid
+      end;</code>. You may want to use it to make sure that a byte sequence is a valid
       UTF-8 string without the need to know where it fails if it is not valid.
     </p>
     <h4>
@@ -1094,7 +1094,7 @@ assert (w == twochars);
     Available in version 1.02 and later.
     </p>
     <p>
-      Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+      Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
       decreases the iterator until it hits the beginning of the previous UTF-8 encoded
       code point and returns the 32 bits representation of the code point.
     </p>
@@ -1133,7 +1133,7 @@ assert (w == twochars);
     Deprecated in version 1.02 and later.
     </p>
     <p>
-      Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+      Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
       decreases the iterator until it hits the beginning of the previous UTF-8 encoded
       code point and returns the 32 bits representation of the code point.
     </p>
@@ -1219,7 +1219,7 @@ assert (w == twochars + <span class="literal">5</span>);
     Available in version 1.0 and later.
     </p>
     <p>
-      Given the iterators to two UTF-8 encoded code points in a seqence, returns the
+      Given the iterators to two UTF-8 encoded code points in a sequence, returns the
       number of code points between them.
     </p>
 <pre>

-- 
Development for GoFind!