[SCM] Development for GoFind! branch, master, updated. 56b0cd6ccd531af549aebd4f767ff77a21cd8949
Miriam Ruiz
miriam at debian.org
Wed May 6 15:46:19 UTC 2009
The following commit has been merged in the master branch:
commit 56b0cd6ccd531af549aebd4f767ff77a21cd8949
Author: Miriam Ruiz <miriam at debian.org>
Date: Wed May 6 17:53:33 2009 +0200
Improved parsing support for HTML entities in UTF8
Restructured UTF8 related code
diff --git a/Makefile b/Makefile
index 06fc801..c432a64 100644
--- a/Makefile
+++ b/Makefile
@@ -26,17 +26,20 @@ PKGCONFIG_CFLAGS= `pkg-config $(PKGCONFIG_FILES) --cflags`
DEFS=-DUSE_GETTEXT
STATIC_CFLAGS= -O2 -g -Wall $(DEFS)
CFLAGS= $(STATIC_CFLAGS) -fPIC
+TEST_CFLAGS= -Wall -Werror $(CFLAGS)
LDFLAGS= -Wl,-z,defs -Wl,--as-needed -Wl,--no-undefined
PLUGINS_LDFLAGS=
LIBS= -lept -lept-core -lapt-pkg -lxapian -ldl `pkg-config $(PKGCONFIG_FILES) --libs`
OBJS= Engine.o Environment.o filter.o field.o gofind.o \
taghandler.o cfgmanager.o boolparser.o apthelper.o \
- utf8.o dll.o guiplugin.o pkgdata.o slre.o i18n.o
+ dll.o guiplugin.o pkgdata.o slre.o i18n.o \
+ utf8/parser.o utf8/html.o
LIB_OBJS= Engine.o Environment.o filter.o field.o \
taghandler.o cfgmanager.o boolparser.o apthelper.o \
- utf8.o dll.o guiplugin.o pkgdata.o slre.o i18n.o
+ dll.o guiplugin.o pkgdata.o slre.o i18n.o \
+ utf8/parser.o utf8/html.o
HEADERS=$(shell find . -name "*.h")
@@ -89,24 +92,25 @@ gui_lua.o: gui_lua.cpp
gui_luagtk.o: gui_luagtk.cpp
g++ -o $@ -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS) `pkg-config lua5.1 --cflags`
-%.o: %.cpp $(HEADERS)
+%.o: %.cpp $(HEADERS) Makefile
g++ -o $@ -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS)
-%.o: %.c $(HEADERS)
+%.o: %.c $(HEADERS) Makefile
gcc -o $@ -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS)
%.so : %.o
g++ $(LDFLAGS) -shared $^ -o $@
-%.static.o: %.cpp $(HEADERS)
+%.static.o: %.cpp $(HEADERS) Makefile
g++ -o $@ -c $< $(STATIC_CFLAGS) $(PKGCONFIG_CFLAGS)
-%.static.o: %.c $(HEADERS)
+%.static.o: %.c $(HEADERS) Makefile
gcc -o $@ -c $< $(STATIC_CFLAGS) $(PKGCONFIG_CFLAGS)
TEST_OBJS= filter.test.o taghandler.test.o cfgmanager.test.o \
- boolparser.test.o slre.test.o utf8.test.o apthelper.test.o \
- dll.test.o i18n.test.o utf8/testutf8.test.o CuTest.o test.o
+ boolparser.test.o slre.test.o apthelper.test.o \
+ dll.test.o i18n.test.o utf8/testutf8.test.o \
+ CuTest.o test.o utf8/parser.test.o utf8/html.test.o
TEST_PLUGINS= testplugin.test.so
test: $(TEST_OBJS) $(TEST_PLUGINS)
@@ -115,17 +119,17 @@ test: $(TEST_OBJS) $(TEST_PLUGINS)
test.c:
sh CuTest.sh > $@
-test.o: test.c $(HEADERS)
- gcc -o $@ -DUNIT_TEST -c $< $(CFLAGS) $(PKGCONFIG_CFLAGS)
+test.o: test.c $(HEADERS) Makefile
+ gcc -o $@ -DUNIT_TEST -c $< $(TEST_CFLAGS) $(PKGCONFIG_CFLAGS)
-%.test.o: %.cpp $(HEADERS)
- g++ -o $@ -DUNIT_TEST -c $< -Wall -Werror $(CFLAGS) $(PKGCONFIG_CFLAGS)
+%.test.o: %.cpp $(HEADERS) Makefile
+ g++ -o $@ -DUNIT_TEST -c $< $(TEST_CFLAGS) $(PKGCONFIG_CFLAGS)
-%.test.o: %.c $(HEADERS)
- gcc -o $@ -DUNIT_TEST -c $< -Wall -Werror $(CFLAGS) $(PKGCONFIG_CFLAGS)
+%.test.o: %.c $(HEADERS) Makefile
+ gcc -o $@ -DUNIT_TEST -c $< $(TEST_CFLAGS) $(PKGCONFIG_CFLAGS)
%.test.so : %.test.o
g++ $(CFLAGS) $(PKGCONFIG_CFLAGS) -DUNIT_TEST -shared $^ -o $@
clean:
- rm -f gofind test test.c cli/*.o fltk/*.o *.o *.so *.so* *.a
+ rm -f gofind test test.c utf8/*.o cli/*.o fltk/*.o *.o *.so *.so* *.a
diff --git a/boolparser.cpp b/boolparser.cpp
index 461de07..b3462e8 100644
--- a/boolparser.cpp
+++ b/boolparser.cpp
@@ -23,10 +23,6 @@
#include "common.h"
#include "boolparser.h"
-#ifdef USE_UTF8
-#include "utf8.h"
-#endif
-
#include <cassert>
#include <memory>
#include <cstring>
diff --git a/gofind.cpp b/gofind.cpp
index b0413b4..dfd3d6b 100644
--- a/gofind.cpp
+++ b/gofind.cpp
@@ -2,7 +2,7 @@
* debtags - Implement package tags support for Debian
*
* Copyright (C) 2007 Enrico Zini <enrico at debian.org>
- * Copyright (C) 2007, 2008 Miriam Ruiz <little_miry at yahoo.es>
+ * Copyright (C) 2007-2009 Miriam Ruiz <little_miry at yahoo.es>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
diff --git a/slre.c b/slre.c
index ac50590..b8bd4a9 100644
--- a/slre.c
+++ b/slre.c
@@ -21,6 +21,8 @@
/*
* Copyright (c) 2004-2005 Sergey Lyubka <valenok at gmail.com>
* All rights reserved
+ *
+ * Webpage: http://slre.sourceforge.net/
*
* "THE BEER-WARE LICENSE" (Revision 42):
* Sergey Lyubka wrote this file. As long as you retain this notice you
@@ -28,15 +30,29 @@
* this stuff is worth it, you can buy me a beer in return.
*/
-/* Webpage: http://slre.sourceforge.net/ */
+/*
+ *
+ * Functions helper_utf8_to_unicode_char, helper_unicode_is_space and helper_unicode_is_digit are derived from:
+ *
+ * helper_utf8.c - Raptor UTF-8 and Unicode support
+ *
+ * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
+ * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ */
#include "common.h"
#include "slre.h"
-#ifdef USE_UTF8
-#include "utf8.h"
-#endif
-
#include <stdio.h>
#include <assert.h>
#include <ctype.h>
@@ -48,6 +64,172 @@
enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
+#ifdef USE_UTF8
+/**
+ * helper_utf8_to_unicode_char:
+ * @output: Pointer to the Unicode character or NULL
+ * @input: UTF-8 string buffer
+ * @length: buffer size
+ *
+ * Convert a UTF-8 encoded buffer to a Unicode character.
+ *
+ * If output is NULL, only the number of bytes that would be used from
+ * the input buffer is calculated and no conversion is performed.
+ *
+ * Return value: bytes used from input buffer or <0 on failure:
+ * -1 input buffer too short or invalid leading byte
+ * -2 overlong UTF-8 sequence
+ * -3 illegal code positions
+ * -4 code out of range U+0000 to U+10FFFF.
+ * In cases -2, -3 and -4 the coded character is stored in the output.
+ */
+static int helper_utf8_to_unicode_char(unsigned long *output, const unsigned char *input, int length)
+{
+ unsigned char in;
+ int size;
+ unsigned long c=0;
+
+ if(length < 1)
+ return -1;
+
+ in=*input++;
+ if((in & 0x80) == 0)
+ {
+ size=1;
+ c= in & 0x7f;
+ }
+ else if((in & 0xe0) == 0xc0)
+ {
+ size=2;
+ c= in & 0x1f;
+ }
+ else if((in & 0xf0) == 0xe0)
+ {
+ size=3;
+ c= in & 0x0f;
+ }
+ else if((in & 0xf8) == 0xf0)
+ {
+ size=4;
+ c = in & 0x07;
+ }
+ else if((in & 0xfc) == 0xf8)
+ {
+ size=5;
+ c = in & 0x03;
+ }
+ else if((in & 0xfe) == 0xfc)
+ {
+ size=6;
+ c = in & 0x01;
+ } else
+ return -1;
+
+ if(!output)
+ return size;
+
+ if(length < size)
+ return -1;
+
+ switch(size)
+ {
+ case 6:
+ in=*input++ & 0x3f;
+ c= c << 6;
+ c |= in;
+ /* FALLTHROUGH */
+ case 5:
+ in=*input++ & 0x3f;
+ c= c << 6;
+ c |= in;
+ /* FALLTHROUGH */
+ case 4:
+ in=*input++ & 0x3f;
+ c= c << 6;
+ c |= in;
+ /* FALLTHROUGH */
+ case 3:
+ in=*input++ & 0x3f;
+ c= c << 6;
+ c |= in;
+ /* FALLTHROUGH */
+ case 2:
+ in=*input++ & 0x3f;
+ c= c << 6;
+ c |= in;
+ /* FALLTHROUGH */
+ default:
+ break;
+ }
+
+ *output=c;
+
+ /* check for overlong UTF-8 sequences */
+ switch(size)
+ {
+ case 2:
+ if(c < 0x00000080)
+ return -2;
+ break;
+ case 3:
+ if(c < 0x00000800)
+ return -2;
+ break;
+ case 4:
+ if(c < 0x00010000)
+ return -2;
+ break;
+
+ default: /* 1 */
+ break;
+ }
+
+ /* check for illegal code positions:
+ * U+D800 to U+DFFF (UTF-16 surrogates)
+ * U+FFFE and U+FFFF
+ */
+ if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
+ return -3;
+
+ /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
+ /* of course this makes some 4 byte forms illegal */
+ if(c > 0x10ffff)
+ return -4;
+
+ return size;
+}
+
+static int helper_unicode_is_space(long c)
+{
+ return((c == 0x0020) || /* Space */
+ (c == 0x000C) || /* Page jump: \f */
+ (c == 0x000D) || /* Carriage return: \r */
+ (c == 0x000A) || /* Next line: \n */
+ (c == 0x0009) || /* Horizontal tab: \t */
+ (c == 0x000B) ); /* Vertical tab \v */
+}
+
+static int helper_unicode_is_digit(long c)
+{
+ /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */
+ return((c >= 0x0030 && c <= 0x0039 ) ||
+ (c >= 0x0660 && c <= 0x0669 ) ||
+ (c >= 0x06F0 && c <= 0x06F9 ) ||
+ (c >= 0x0966 && c <= 0x096F ) ||
+ (c >= 0x09E6 && c <= 0x09EF ) ||
+ (c >= 0x0A66 && c <= 0x0A6F ) ||
+ (c >= 0x0AE6 && c <= 0x0AEF ) ||
+ (c >= 0x0B66 && c <= 0x0B6F ) ||
+ (c >= 0x0BE7 && c <= 0x0BEF ) ||
+ (c >= 0x0C66 && c <= 0x0C6F ) ||
+ (c >= 0x0CE6 && c <= 0x0CEF ) ||
+ (c >= 0x0D66 && c <= 0x0D6F ) ||
+ (c >= 0x0E50 && c <= 0x0E59 ) ||
+ (c >= 0x0ED0 && c <= 0x0ED9 ) ||
+ (c >= 0x0F20 && c <= 0x0F29 ));
+}
+#endif // USE_UTF8
+
static struct {
const char *name;
int narg;
diff --git a/utf8.c b/utf8.c
deleted file mode 100644
index 40a5532..0000000
--- a/utf8.c
+++ /dev/null
@@ -1,768 +0,0 @@
-/*
- * helper_utf8.c - Raptor UTF-8 and Unicode support
- *
- * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
- * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
- *
- * This package is Free Software and part of Redland http://librdf.org/
- *
- * It is licensed under the following three licenses as alternatives:
- * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
- * 2. GNU General Public License (GPL) V2 or any newer version
- * 3. Apache License, V2.0 or any newer version
- *
- * You may not use this file except in compliance with at least one of
- * the above three licenses.
- */
-
-#include "common.h"
-#include "utf8.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-
-
-/**
- * helper_unicode_char_to_utf8:
- * @c: Unicode character
- * @output: UTF-8 string buffer or NULL
- *
- * Convert a Unicode character to UTF-8 encoding.
- *
- * Based on librdf_unicode_char_to_utf8() with no need to calculate
- * length since the encoded character is always copied into a buffer
- * with sufficient size.
- *
- * Return value: bytes encoded to output buffer or <0 on failure
- **/
-int helper_unicode_char_to_utf8(unsigned long c, unsigned char *output)
-{
- int size=0;
-
- if (c < 0x00000080)
- size=1;
- else if (c < 0x00000800)
- size=2;
- else if (c < 0x00010000)
- size=3;
- else if (c < 0x00200000)
- size=4;
- else if (c < 0x04000000)
- size=5;
- else if (c < 0x80000000)
- size=6;
- else
- return -1;
-
- switch(size) {
- case 6:
- output[5]=0x80 | (unsigned char)(c & 0x3F);
- c= c >> 6;
- /* set bit 2 (bits 7,6,5,4,3,2 less 7,6,5,4,3 set below) on last byte */
- c |= 0x4000000; /* 0x10000 = 0x04 << 24 */
- /* FALLTHROUGH */
- case 5:
- output[4]=0x80 | (unsigned char)(c & 0x3F);
- c= c >> 6;
- /* set bit 3 (bits 7,6,5,4,3 less 7,6,5,4 set below) on last byte */
- c |= 0x200000; /* 0x10000 = 0x08 << 18 */
- /* FALLTHROUGH */
- case 4:
- output[3]=0x80 | (unsigned char)(c & 0x3F);
- c= c >> 6;
- /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on last byte */
- c |= 0x10000; /* 0x10000 = 0x10 << 12 */
- /* FALLTHROUGH */
- case 3:
- output[2]=0x80 | (unsigned char)(c & 0x3F);
- c= c >> 6;
- /* set bit 5 (bits 7,6,5 less 7,6 set below) on last byte */
- c |= 0x800; /* 0x800 = 0x20 << 6 */
- /* FALLTHROUGH */
- case 2:
- output[1]=0x80 | (unsigned char)(c & 0x3F);
- c= c >> 6;
- /* set bits 7,6 on last byte */
- c |= 0xc0;
- /* FALLTHROUGH */
- case 1:
- output[0]=(unsigned char)c;
- }
-
- return size;
-}
-
-
-/**
- * helper_utf8_to_unicode_char:
- * @output: Pointer to the Unicode character or NULL
- * @input: UTF-8 string buffer
- * @length: buffer size
- *
- * Convert an UTF-8 encoded buffer to a Unicode character.
- *
- * If output is NULL, then will calculate the number of bytes that
- * will be used from the input buffer and not perform the conversion.
- *
- * Return value: bytes used from input buffer or <0 on failure:
- * -1 input buffer too short or length error
- * -2 overlong UTF-8 sequence
- * -3 illegal code positions
- * -4 code out of range U+0000 to U+10FFFF.
- * In cases -2, -3 and -4 the coded character is stored in the output.
- */
-int helper_utf8_to_unicode_char(unsigned long *output,
- const unsigned char *input, int length)
-{
- unsigned char in;
- int size;
- unsigned long c=0;
-
- if(length < 1)
- return -1;
-
- in=*input++;
- if((in & 0x80) == 0) {
- size=1;
- c= in & 0x7f;
- } else if((in & 0xe0) == 0xc0) {
- size=2;
- c= in & 0x1f;
- } else if((in & 0xf0) == 0xe0) {
- size=3;
- c= in & 0x0f;
- } else if((in & 0xf8) == 0xf0) {
- size=4;
- c = in & 0x07;
- } else if((in & 0xfc) == 0xf8) {
- size=5;
- c = in & 0x03;
- } else if((in & 0xfe) == 0xfc) {
- size=6;
- c = in & 0x01;
- } else
- return -1;
-
-
- if(!output)
- return size;
-
- if(length < size)
- return -1;
-
- switch(size) {
- case 6:
- in=*input++ & 0x3f;
- c= c << 6;
- c |= in;
- /* FALLTHROUGH */
- case 5:
- in=*input++ & 0x3f;
- c= c << 6;
- c |= in;
- /* FALLTHROUGH */
- case 4:
- in=*input++ & 0x3f;
- c= c << 6;
- c |= in;
- /* FALLTHROUGH */
- case 3:
- in=*input++ & 0x3f;
- c= c << 6;
- c |= in;
- /* FALLTHROUGH */
- case 2:
- in=*input++ & 0x3f;
- c= c << 6;
- c |= in;
- /* FALLTHROUGH */
- default:
- break;
- }
-
- *output=c;
-
- /* check for overlong UTF-8 sequences */
- switch(size) {
- case 2:
- if(c < 0x00000080)
- return -2;
- break;
- case 3:
- if(c < 0x00000800)
- return -2;
- break;
- case 4:
- if(c < 0x00010000)
- return -2;
- break;
-
- default: /* 1 */
- break;
- }
-
-
- /* check for illegal code positions:
- * U+D800 to U+DFFF (UTF-16 surrogates)
- * U+FFFE and U+FFFF
- */
- if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
- return -3;
-
- /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
- /* of course this makes some 4 byte forms illegal */
- if(c > 0x10ffff)
- return -4;
-
- return size;
-}
-
-
-/**
- * helper_unicode_is_xml11_namestartchar:
- * @c: Unicode character to check
- *
- * Check if Unicode character is legal to start an XML 1.1 Name
- *
- * Namespaces in XML 1.1 REC 2004-02-04
- * http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
- * updating
- * Extensible Markup Language (XML) 1.1 REC 2004-02-04
- * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
- * excluding the ':'
- *
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml11_namestartchar(long c)
-{
- return (((c >= 0x0041) && (c <= 0x005A)) || /* [A-Z] */
- (c == 0x005F) || /* '_' */
- ((c >= 0x0061) && (c <= 0x007A)) || /* [a-z] */
- ((c >= 0x00C0) && (c <= 0x00D6)) ||
- ((c >= 0x00D8) && (c <= 0x00F6)) ||
- ((c >= 0x00F8) && (c <= 0x02FF)) ||
- ((c >= 0x0370) && (c <= 0x037D)) ||
- ((c >= 0x037F) && (c <= 0x1FFF)) ||
- ((c >= 0x200C) && (c <= 0x200D)) ||
- ((c >= 0x2070) && (c <= 0x218F)) ||
- ((c >= 0x2C00) && (c <= 0x2FEF)) ||
- ((c >= 0x3001) && (c <= 0xD7FF)) ||
- ((c >= 0xF900) && (c <= 0xFDCF)) ||
- ((c >= 0xFDF0) && (c <= 0xFFFD)) ||
- ((c >= 0x10000) && (c <= 0xEFFFF)));
-}
-
-
-/**
- * helper_unicode_is_xml10_namestartchar:
- * @c: Unicode character to check
- *
- * Check if Unicode character is legal to start an XML 1.0 Name
- *
- * Namespaces in XML REC 1999-01-14
- * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
- * updating
- * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
- * http://www.w3.org/TR/2004/REC-xml-20040204/
- * excluding the ':'
- *
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml10_namestartchar(long c)
-{
- return (helper_unicode_is_letter(c) ||
- (c == '_'));
-}
-
-
-/**
- * helper_unicode_is_xml11_namechar:
- * @c: Unicode character
- *
- * Check if a Unicode codepoint is a legal to continue an XML 1.1 Name
- *
- * Namespaces in XML 1.1 REC 2004-02-04
- * http://www.w3.org/TR/2004/REC-xml11-20040204/
- * updating
- * Extensible Markup Language (XML) 1.1 REC 2004-02-04
- * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
- * excluding the ':'
- *
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml11_namechar(long c)
-{
- return (helper_unicode_is_xml11_namestartchar(c) ||
- (c == 0x002D) || /* '-' */
- (c == 0x002E) || /* '.' */
- (c >= 0x0030 && c <= 0x0039) || /* 0-9 */
- (c == 0x00B7) ||
- (c >= 0x0300 && c <=0x036F) ||
- (c >= 0x203F && c <=0x2040));
-}
-
-
-/**
- * helper_unicode_is_xml10_namechar:
- * @c: Unicode character
- *
- * Check if a Unicode codepoint is a legal to continue an XML 1.0 Name
- *
- * Namespaces in XML REC 1999-01-14
- * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar
- * updating
- * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
- * http://www.w3.org/TR/2004/REC-xml-20040204/
- * excluding the ':'
- *
- * Return value: non-0 if legal
- **/
-int helper_unicode_is_xml10_namechar(long c)
-{
- return (helper_unicode_is_letter(c) ||
- helper_unicode_is_digit(c) ||
- (c == 0x002E) || /* '.' */
- (c == 0x002D) || /* '-' */
- (c == 0x005F) || /* '_' */
- helper_unicode_is_combiningchar(c) ||
- helper_unicode_is_extender(c));
-}
-
-
-/*
- * All this below was derived by machine-transforming the classes in Appendix B
- * of http://www.w3.org/TR/2000/REC-xml-20001006
- */
-
-int helper_unicode_is_letter(long c)
-{
- return(helper_unicode_is_basechar(c) ||
- helper_unicode_is_ideographic(c));
-}
-
-
-int helper_unicode_is_basechar(long c)
-{
- /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */
- return((c >= 0x0041 && c <= 0x005A ) ||
- (c >= 0x0061 && c <= 0x007A ) ||
- (c >= 0x00C0 && c <= 0x00D6 ) ||
- (c >= 0x00D8 && c <= 0x00F6 ) ||
- (c >= 0x00F8 && c <= 0x00FF ) ||
- (c >= 0x0100 && c <= 0x0131 ) ||
- (c >= 0x0134 && c <= 0x013E ) ||
- (c >= 0x0141 && c <= 0x0148 ) ||
- (c >= 0x014A && c <= 0x017E ) ||
- (c >= 0x0180 && c <= 0x01C3 ) ||
- (c >= 0x01CD && c <= 0x01F0 ) ||
- (c >= 0x01F4 && c <= 0x01F5 ) ||
- (c >= 0x01FA && c <= 0x0217 ) ||
- (c >= 0x0250 && c <= 0x02A8 ) ||
- (c >= 0x02BB && c <= 0x02C1 ) ||
- (c == 0x0386) ||
- (c >= 0x0388 && c <= 0x038A ) ||
- (c == 0x038C) ||
- (c >= 0x038E && c <= 0x03A1 ) ||
- (c >= 0x03A3 && c <= 0x03CE ) ||
- (c >= 0x03D0 && c <= 0x03D6 ) ||
- (c == 0x03DA) ||
- (c == 0x03DC) ||
- (c == 0x03DE) ||
- (c == 0x03E0) ||
- (c >= 0x03E2 && c <= 0x03F3 ) ||
- (c >= 0x0401 && c <= 0x040C ) ||
- (c >= 0x040E && c <= 0x044F ) ||
- (c >= 0x0451 && c <= 0x045C ) ||
- (c >= 0x045E && c <= 0x0481 ) ||
- (c >= 0x0490 && c <= 0x04C4 ) ||
- (c >= 0x04C7 && c <= 0x04C8 ) ||
- (c >= 0x04CB && c <= 0x04CC ) ||
- (c >= 0x04D0 && c <= 0x04EB ) ||
- (c >= 0x04EE && c <= 0x04F5 ) ||
- (c >= 0x04F8 && c <= 0x04F9 ) ||
- (c >= 0x0531 && c <= 0x0556 ) ||
- (c == 0x0559) ||
- (c >= 0x0561 && c <= 0x0586 ) ||
- (c >= 0x05D0 && c <= 0x05EA ) ||
- (c >= 0x05F0 && c <= 0x05F2 ) ||
- (c >= 0x0621 && c <= 0x063A ) ||
- (c >= 0x0641 && c <= 0x064A ) ||
- (c >= 0x0671 && c <= 0x06B7 ) ||
- (c >= 0x06BA && c <= 0x06BE ) ||
- (c >= 0x06C0 && c <= 0x06CE ) ||
- (c >= 0x06D0 && c <= 0x06D3 ) ||
- (c == 0x06D5) ||
- (c >= 0x06E5 && c <= 0x06E6 ) ||
- (c >= 0x0905 && c <= 0x0939 ) ||
- (c == 0x093D) ||
- (c >= 0x0958 && c <= 0x0961 ) ||
- (c >= 0x0985 && c <= 0x098C ) ||
- (c >= 0x098F && c <= 0x0990 ) ||
- (c >= 0x0993 && c <= 0x09A8 ) ||
- (c >= 0x09AA && c <= 0x09B0 ) ||
- (c == 0x09B2) ||
- (c >= 0x09B6 && c <= 0x09B9 ) ||
- (c >= 0x09DC && c <= 0x09DD ) ||
- (c >= 0x09DF && c <= 0x09E1 ) ||
- (c >= 0x09F0 && c <= 0x09F1 ) ||
- (c >= 0x0A05 && c <= 0x0A0A ) ||
- (c >= 0x0A0F && c <= 0x0A10 ) ||
- (c >= 0x0A13 && c <= 0x0A28 ) ||
- (c >= 0x0A2A && c <= 0x0A30 ) ||
- (c >= 0x0A32 && c <= 0x0A33 ) ||
- (c >= 0x0A35 && c <= 0x0A36 ) ||
- (c >= 0x0A38 && c <= 0x0A39 ) ||
- (c >= 0x0A59 && c <= 0x0A5C ) ||
- (c == 0x0A5E) ||
- (c >= 0x0A72 && c <= 0x0A74 ) ||
- (c >= 0x0A85 && c <= 0x0A8B ) ||
- (c == 0x0A8D) ||
- (c >= 0x0A8F && c <= 0x0A91 ) ||
- (c >= 0x0A93 && c <= 0x0AA8 ) ||
- (c >= 0x0AAA && c <= 0x0AB0 ) ||
- (c >= 0x0AB2 && c <= 0x0AB3 ) ||
- (c >= 0x0AB5 && c <= 0x0AB9 ) ||
- (c == 0x0ABD) ||
- (c == 0x0AE0) ||
- (c >= 0x0B05 && c <= 0x0B0C ) ||
- (c >= 0x0B0F && c <= 0x0B10 ) ||
- (c >= 0x0B13 && c <= 0x0B28 ) ||
- (c >= 0x0B2A && c <= 0x0B30 ) ||
- (c >= 0x0B32 && c <= 0x0B33 ) ||
- (c >= 0x0B36 && c <= 0x0B39 ) ||
- (c == 0x0B3D) ||
- (c >= 0x0B5C && c <= 0x0B5D ) ||
- (c >= 0x0B5F && c <= 0x0B61 ) ||
- (c >= 0x0B85 && c <= 0x0B8A ) ||
- (c >= 0x0B8E && c <= 0x0B90 ) ||
- (c >= 0x0B92 && c <= 0x0B95 ) ||
- (c >= 0x0B99 && c <= 0x0B9A ) ||
- (c == 0x0B9C) ||
- (c >= 0x0B9E && c <= 0x0B9F ) ||
- (c >= 0x0BA3 && c <= 0x0BA4 ) ||
- (c >= 0x0BA8 && c <= 0x0BAA ) ||
- (c >= 0x0BAE && c <= 0x0BB5 ) ||
- (c >= 0x0BB7 && c <= 0x0BB9 ) ||
- (c >= 0x0C05 && c <= 0x0C0C ) ||
- (c >= 0x0C0E && c <= 0x0C10 ) ||
- (c >= 0x0C12 && c <= 0x0C28 ) ||
- (c >= 0x0C2A && c <= 0x0C33 ) ||
- (c >= 0x0C35 && c <= 0x0C39 ) ||
- (c >= 0x0C60 && c <= 0x0C61 ) ||
- (c >= 0x0C85 && c <= 0x0C8C ) ||
- (c >= 0x0C8E && c <= 0x0C90 ) ||
- (c >= 0x0C92 && c <= 0x0CA8 ) ||
- (c >= 0x0CAA && c <= 0x0CB3 ) ||
- (c >= 0x0CB5 && c <= 0x0CB9 ) ||
- (c == 0x0CDE) ||
- (c >= 0x0CE0 && c <= 0x0CE1 ) ||
- (c >= 0x0D05 && c <= 0x0D0C ) ||
- (c >= 0x0D0E && c <= 0x0D10 ) ||
- (c >= 0x0D12 && c <= 0x0D28 ) ||
- (c >= 0x0D2A && c <= 0x0D39 ) ||
- (c >= 0x0D60 && c <= 0x0D61 ) ||
- (c >= 0x0E01 && c <= 0x0E2E ) ||
- (c == 0x0E30) ||
- (c >= 0x0E32 && c <= 0x0E33 ) ||
- (c >= 0x0E40 && c <= 0x0E45 ) ||
- (c >= 0x0E81 && c <= 0x0E82 ) ||
- (c == 0x0E84) ||
- (c >= 0x0E87 && c <= 0x0E88 ) ||
- (c == 0x0E8A) ||
- (c == 0x0E8D) ||
- (c >= 0x0E94 && c <= 0x0E97 ) ||
- (c >= 0x0E99 && c <= 0x0E9F ) ||
- (c >= 0x0EA1 && c <= 0x0EA3 ) ||
- (c == 0x0EA5) ||
- (c == 0x0EA7) ||
- (c >= 0x0EAA && c <= 0x0EAB ) ||
- (c >= 0x0EAD && c <= 0x0EAE ) ||
- (c == 0x0EB0) ||
- (c >= 0x0EB2 && c <= 0x0EB3 ) ||
- (c == 0x0EBD) ||
- (c >= 0x0EC0 && c <= 0x0EC4 ) ||
- (c >= 0x0F40 && c <= 0x0F47 ) ||
- (c >= 0x0F49 && c <= 0x0F69 ) ||
- (c >= 0x10A0 && c <= 0x10C5 ) ||
- (c >= 0x10D0 && c <= 0x10F6 ) ||
- (c == 0x1100) ||
- (c >= 0x1102 && c <= 0x1103 ) ||
- (c >= 0x1105 && c <= 0x1107 ) ||
- (c == 0x1109) ||
- (c >= 0x110B && c <= 0x110C ) ||
- (c >= 0x110E && c <= 0x1112 ) ||
- (c == 0x113C) ||
- (c == 0x113E) ||
- (c == 0x1140) ||
- (c == 0x114C) ||
- (c == 0x114E) ||
- (c == 0x1150) ||
- (c >= 0x1154 && c <= 0x1155 ) ||
- (c == 0x1159) ||
- (c >= 0x115F && c <= 0x1161 ) ||
- (c == 0x1163) ||
- (c == 0x1165) ||
- (c == 0x1167) ||
- (c == 0x1169) ||
- (c >= 0x116D && c <= 0x116E ) ||
- (c >= 0x1172 && c <= 0x1173 ) ||
- (c == 0x1175) ||
- (c == 0x119E) ||
- (c == 0x11A8) ||
- (c == 0x11AB) ||
- (c >= 0x11AE && c <= 0x11AF ) ||
- (c >= 0x11B7 && c <= 0x11B8 ) ||
- (c == 0x11BA) ||
- (c >= 0x11BC && c <= 0x11C2 ) ||
- (c == 0x11EB) ||
- (c == 0x11F0) ||
- (c == 0x11F9) ||
- (c >= 0x1E00 && c <= 0x1E9B ) ||
- (c >= 0x1EA0 && c <= 0x1EF9 ) ||
- (c >= 0x1F00 && c <= 0x1F15 ) ||
- (c >= 0x1F18 && c <= 0x1F1D ) ||
- (c >= 0x1F20 && c <= 0x1F45 ) ||
- (c >= 0x1F48 && c <= 0x1F4D ) ||
- (c >= 0x1F50 && c <= 0x1F57 ) ||
- (c == 0x1F59) ||
- (c == 0x1F5B) ||
- (c == 0x1F5D) ||
- (c >= 0x1F5F && c <= 0x1F7D ) ||
- (c >= 0x1F80 && c <= 0x1FB4 ) ||
- (c >= 0x1FB6 && c <= 0x1FBC ) ||
- (c == 0x1FBE) ||
- (c >= 0x1FC2 && c <= 0x1FC4 ) ||
- (c >= 0x1FC6 && c <= 0x1FCC ) ||
- (c >= 0x1FD0 && c <= 0x1FD3 ) ||
- (c >= 0x1FD6 && c <= 0x1FDB ) ||
- (c >= 0x1FE0 && c <= 0x1FEC ) ||
- (c >= 0x1FF2 && c <= 0x1FF4 ) ||
- (c >= 0x1FF6 && c <= 0x1FFC ) ||
- (c == 0x2126) ||
- (c >= 0x212A && c <= 0x212B ) ||
- (c == 0x212E) ||
- (c >= 0x2180 && c <= 0x2182 ) ||
- (c >= 0x3041 && c <= 0x3094 ) ||
- (c >= 0x30A1 && c <= 0x30FA ) ||
- (c >= 0x3105 && c <= 0x312C ) ||
- (c >= 0xAC00 && c <= 0xD7A3 )
- );
-}
-
-
-int helper_unicode_is_ideographic(long c)
-{
- /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic */
- return((c >= 0x4E00 && c <= 0x9FA5 ) ||
- (c == 0x3007) ||
- (c >= 0x3021 && c <= 0x3029 ));
-}
-
-
-int helper_unicode_is_combiningchar(long c)
-{
- /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar */
- return((c >= 0x0300 && c <= 0x0345 ) ||
- (c >= 0x0360 && c <= 0x0361 ) ||
- (c >= 0x0483 && c <= 0x0486 ) ||
- (c >= 0x0591 && c <= 0x05A1 ) ||
- (c >= 0x05A3 && c <= 0x05B9 ) ||
- (c >= 0x05BB && c <= 0x05BD ) ||
- (c == 0x05BF) ||
- (c >= 0x05C1 && c <= 0x05C2 ) ||
- (c == 0x05C4) ||
- (c >= 0x064B && c <= 0x0652 ) ||
- (c == 0x0670) ||
- (c >= 0x06D6 && c <= 0x06DC ) ||
- (c >= 0x06DD && c <= 0x06DF ) ||
- (c >= 0x06E0 && c <= 0x06E4 ) ||
- (c >= 0x06E7 && c <= 0x06E8 ) ||
- (c >= 0x06EA && c <= 0x06ED ) ||
- (c >= 0x0901 && c <= 0x0903 ) ||
- (c == 0x093C) ||
- (c >= 0x093E && c <= 0x094C ) ||
- (c == 0x094D) ||
- (c >= 0x0951 && c <= 0x0954 ) ||
- (c >= 0x0962 && c <= 0x0963 ) ||
- (c >= 0x0981 && c <= 0x0983 ) ||
- (c == 0x09BC) ||
- (c == 0x09BE) ||
- (c == 0x09BF) ||
- (c >= 0x09C0 && c <= 0x09C4 ) ||
- (c >= 0x09C7 && c <= 0x09C8 ) ||
- (c >= 0x09CB && c <= 0x09CD ) ||
- (c == 0x09D7) ||
- (c >= 0x09E2 && c <= 0x09E3 ) ||
- (c == 0x0A02) ||
- (c == 0x0A3C) ||
- (c == 0x0A3E) ||
- (c == 0x0A3F) ||
- (c >= 0x0A40 && c <= 0x0A42 ) ||
- (c >= 0x0A47 && c <= 0x0A48 ) ||
- (c >= 0x0A4B && c <= 0x0A4D ) ||
- (c >= 0x0A70 && c <= 0x0A71 ) ||
- (c >= 0x0A81 && c <= 0x0A83 ) ||
- (c == 0x0ABC) ||
- (c >= 0x0ABE && c <= 0x0AC5 ) ||
- (c >= 0x0AC7 && c <= 0x0AC9 ) ||
- (c >= 0x0ACB && c <= 0x0ACD ) ||
- (c >= 0x0B01 && c <= 0x0B03 ) ||
- (c == 0x0B3C) ||
- (c >= 0x0B3E && c <= 0x0B43 ) ||
- (c >= 0x0B47 && c <= 0x0B48 ) ||
- (c >= 0x0B4B && c <= 0x0B4D ) ||
- (c >= 0x0B56 && c <= 0x0B57 ) ||
- (c >= 0x0B82 && c <= 0x0B83 ) ||
- (c >= 0x0BBE && c <= 0x0BC2 ) ||
- (c >= 0x0BC6 && c <= 0x0BC8 ) ||
- (c >= 0x0BCA && c <= 0x0BCD ) ||
- (c == 0x0BD7) ||
- (c >= 0x0C01 && c <= 0x0C03 ) ||
- (c >= 0x0C3E && c <= 0x0C44 ) ||
- (c >= 0x0C46 && c <= 0x0C48 ) ||
- (c >= 0x0C4A && c <= 0x0C4D ) ||
- (c >= 0x0C55 && c <= 0x0C56 ) ||
- (c >= 0x0C82 && c <= 0x0C83 ) ||
- (c >= 0x0CBE && c <= 0x0CC4 ) ||
- (c >= 0x0CC6 && c <= 0x0CC8 ) ||
- (c >= 0x0CCA && c <= 0x0CCD ) ||
- (c >= 0x0CD5 && c <= 0x0CD6 ) ||
- (c >= 0x0D02 && c <= 0x0D03 ) ||
- (c >= 0x0D3E && c <= 0x0D43 ) ||
- (c >= 0x0D46 && c <= 0x0D48 ) ||
- (c >= 0x0D4A && c <= 0x0D4D ) ||
- (c == 0x0D57) ||
- (c == 0x0E31) ||
- (c >= 0x0E34 && c <= 0x0E3A ) ||
- (c >= 0x0E47 && c <= 0x0E4E ) ||
- (c == 0x0EB1) ||
- (c >= 0x0EB4 && c <= 0x0EB9 ) ||
- (c >= 0x0EBB && c <= 0x0EBC ) ||
- (c >= 0x0EC8 && c <= 0x0ECD ) ||
- (c >= 0x0F18 && c <= 0x0F19 ) ||
- (c == 0x0F35) ||
- (c == 0x0F37) ||
- (c == 0x0F39) ||
- (c == 0x0F3E) ||
- (c == 0x0F3F) ||
- (c >= 0x0F71 && c <= 0x0F84 ) ||
- (c >= 0x0F86 && c <= 0x0F8B ) ||
- (c >= 0x0F90 && c <= 0x0F95 ) ||
- (c == 0x0F97) ||
- (c >= 0x0F99 && c <= 0x0FAD ) ||
- (c >= 0x0FB1 && c <= 0x0FB7 ) ||
- (c == 0x0FB9) ||
- (c >= 0x20D0 && c <= 0x20DC ) ||
- (c == 0x20E1) ||
- (c >= 0x302A && c <= 0x302F ) ||
- (c == 0x3099) ||
- (c == 0x309A));
-}
-
-
-int helper_unicode_is_digit(long c)
-{
- /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */
- return((c >= 0x0030 && c <= 0x0039 ) ||
- (c >= 0x0660 && c <= 0x0669 ) ||
- (c >= 0x06F0 && c <= 0x06F9 ) ||
- (c >= 0x0966 && c <= 0x096F ) ||
- (c >= 0x09E6 && c <= 0x09EF ) ||
- (c >= 0x0A66 && c <= 0x0A6F ) ||
- (c >= 0x0AE6 && c <= 0x0AEF ) ||
- (c >= 0x0B66 && c <= 0x0B6F ) ||
- (c >= 0x0BE7 && c <= 0x0BEF ) ||
- (c >= 0x0C66 && c <= 0x0C6F ) ||
- (c >= 0x0CE6 && c <= 0x0CEF ) ||
- (c >= 0x0D66 && c <= 0x0D6F ) ||
- (c >= 0x0E50 && c <= 0x0E59 ) ||
- (c >= 0x0ED0 && c <= 0x0ED9 ) ||
- (c >= 0x0F20 && c <= 0x0F29 ));
-}
-
-
-int helper_unicode_is_extender(long c)
-{
- /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender */
- return((c == 0x00B7) ||
- (c == 0x02D0) ||
- (c == 0x02D1) ||
- (c == 0x0387) ||
- (c == 0x0640) ||
- (c == 0x0E46) ||
- (c == 0x0EC6) ||
- (c == 0x3005) ||
- (c >= 0x3031 && c <= 0x3035 ) ||
- (c >= 0x309D && c <= 0x309E ) ||
- (c >= 0x30FC && c <= 0x30FE ));
-}
-
-
-int helper_unicode_is_space(long c)
-{
- return((c == 0x0020) || /* Space */
- (c == 0x000C) || /* Page jump: \f */
- (c == 0x000D) || /* Carriage return: \r */
- (c == 0x000A) || /* Next line: \n */
- (c == 0x0009) || /* Horizontal tab: \t */
- (c == 0x000B) ); /* Vertical tab \v */
-}
-
-
-/**
- * helper_utf8_is_nfc:
- * @input: UTF-8 string
- * @length: length of string
- *
- * Check a string is in Unicode Normal Form C.
- *
- * Return value: Non 0 if the string is NFC
- **/
-int helper_utf8_is_nfc(const unsigned char *input, size_t length)
-{
- unsigned int i;
- int plain=1;
-
- for(i=0; i<length; i++)
- if(input[i]>0x7f) {
- plain=0;
- break;
- }
-
- if(plain)
- return 1;
-
-#ifdef helper_NFC_CHECK
- return helper_nfc_check(input, length, NULL);
-#else
- return 1;
-#endif
-}
-
-
-/**
- * helper_utf8_check:
- * @string: UTF-8 string
- * @length: length of string
- *
- * Check a string is UTF-8.
- *
- * Return value: Non 0 if the string is UTF-8
- **/
-int helper_utf8_check(const unsigned char *string, size_t length)
-{
- while(length > 0) {
- unsigned long unichar=0;
-
- int unichar_len=helper_utf8_to_unicode_char(&unichar, string, length);
- if(unichar_len < 0 || unichar_len > (int)length)
- return 0;
-
- if(unichar > 0x10ffff)
- return 0;
-
- string += unichar_len;
- length -= unichar_len;
- }
- return 1;
-}
diff --git a/utf8.h b/utf8.h
deleted file mode 100644
index 7d39644..0000000
--- a/utf8.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * helper_utf8.c - Raptor UTF-8 and Unicode support
- *
- * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
- * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
- *
- * This package is Free Software and part of Redland http://librdf.org/
- *
- * It is licensed under the following three licenses as alternatives:
- * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
- * 2. GNU General Public License (GPL) V2 or any newer version
- * 3. Apache License, V2.0 or any newer version
- *
- * You may not use this file except in compliance with at least one of
- * the above three licenses.
- */
-
-#ifndef _GOPLAY_UTF8_H
-#define _GOPLAY_UTF8_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
-int helper_unicode_char_to_utf8(unsigned long c, unsigned char *output);
-int helper_utf8_to_unicode_char(unsigned long *output,
- const unsigned char *input, int length);
-
-int helper_unicode_is_letter(long c);
-int helper_unicode_is_basechar(long c);
-int helper_unicode_is_ideographic(long c);
-int helper_unicode_is_combiningchar(long c);
-int helper_unicode_is_digit(long c);
-int helper_unicode_is_extender(long c);
-
-int helper_unicode_is_space(long c);
-
-int helper_unicode_is_xml11_namestartchar(long c);
-int helper_unicode_is_xml10_namestartchar(long c);
-int helper_unicode_is_xml11_namechar(long c);
-int helper_unicode_is_xml10_namechar(long c);
-
-int helper_utf8_is_nfc(const unsigned char *input, size_t length);
-int helper_utf8_check(const unsigned char *string, size_t length);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // _GOPLAY_UTF8_H
diff --git a/utf8/checked.h b/utf8/checked.h
index b57c4dd..4ce6cdf 100644
--- a/utf8/checked.h
+++ b/utf8/checked.h
@@ -172,18 +172,6 @@ namespace utf8
return next(temp, end);
}
- /// Deprecated in versions that include "prior"
- template <typename octet_iterator>
- uint32_t previous(octet_iterator& it, octet_iterator pass_start)
- {
- octet_iterator end = it;
- while (internal::is_trail(*(--it)))
- if (it == pass_start)
- throw invalid_utf8(*(it+1)); // error - no lead byte in the sequence
- octet_iterator temp = it;
- return next(temp, end);
- }
-
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
@@ -205,6 +193,7 @@ namespace utf8
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
while (start != end) {
+ u16bit_iterator last = start;
uint32_t cp = internal::mask16(*start++);
// Take care of surrogate pairs first
if (internal::is_surrogate(cp)) {
@@ -215,8 +204,8 @@ namespace utf8
else
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
}
- else
- throw invalid_utf16(static_cast<uint16_t>(*(start-1)));
+ else
+ throw invalid_utf16(static_cast<uint16_t>(*last));
}
result = append(cp, result);
diff --git a/utf8/html.cpp b/utf8/html.cpp
new file mode 100644
index 0000000..0ecb308
--- /dev/null
+++ b/utf8/html.cpp
@@ -0,0 +1,380 @@
+#include "../common.h"
+#include "parser.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+#include <map>
+
+/* One row of the HTML entity table. */
+struct HTMLEntityDataType
+{
+    const char *Name;   /* entity name without '&' and ';'; NULL marks the end of a table */
+    unsigned int Char;  /* Unicode code point the entity stands for */
+    int UTF8Size;       /* number of bytes in the UTF-8 encoding of Char */
+    unsigned long UTF8; /* UTF-8 bytes packed into an integer, first byte in the low 8 bits */
+};
+
+/*
+ * Compile-time UTF-8 encoding helpers for code points up to U+FFFF
+ * (encodings of one, two or three bytes; larger values are not handled).
+ *
+ *   UTF8SIZE(Ch)     number of bytes in the encoding
+ *   UTF8BYTE1..3(Ch) first, second and third encoded byte (0 when unused)
+ *   UTF8ENTITY(N,Ch) brace initializer for one HTMLEntityDataType row; the
+ *                    encoded bytes are packed with byte 1 in bits 0-7,
+ *                    byte 2 in bits 8-15 and byte 3 in bits 16-23
+ *
+ * Every use of the Ch parameter is parenthesised so that an expression
+ * argument (for example a | b) expands with the intended precedence.
+ */
+#define UTF8SIZE(Ch)  ( ((Ch) >= 0x800) ? 3 : ( ((Ch) >= 0x80) ? 2 : 1 ) )
+#define UTF8BYTE1(Ch) ( ((Ch) >= 0x800) ? (0xE0 | (((Ch) >> 12) & 0x0F)) : ( ((Ch) >= 0x80) ? (0xC0 | (((Ch) >> 6) & 0x1F)) : (Ch) ) )
+#define UTF8BYTE2(Ch) ( ((Ch) >= 0x800) ? (0x80 | (((Ch) >> 6) & 0x3F)) : ( ((Ch) >= 0x80) ? (0x80 | ((Ch) & 0x3F)) : 0 ) )
+#define UTF8BYTE3(Ch) ( ((Ch) >= 0x800) ? (0x80 | ((Ch) & 0x3F)) : 0 )
+#define UTF8ENTITY(Name, Ch) { Name, (Ch), UTF8SIZE(Ch), UTF8BYTE1(Ch) + (UTF8BYTE2(Ch) << 8) + (UTF8BYTE3(Ch) << 16) }
+
+/*
+ * Named character entities defined in the HTML 4.0 spec.
+ *
+ * Each row is built by UTF8ENTITY(name, code point), which also fills in
+ * the size and packed bytes of the UTF-8 encoding. The array is terminated
+ * by a row whose Name is NULL; code that walks it stops there.
+ */
+static const HTMLEntityDataType HTMLEntityData[] =
+{ // List of entities defined in the HTML 4.0 spec
+    // ISO 8859-1 (Latin-1) characters
+    UTF8ENTITY("nbsp", 160),
+    UTF8ENTITY("iexcl", 161),
+    UTF8ENTITY("cent", 162),
+    UTF8ENTITY("pound", 163),
+    UTF8ENTITY("curren", 164),
+    UTF8ENTITY("yen", 165),
+    UTF8ENTITY("brvbar", 166),
+    UTF8ENTITY("sect", 167),
+    UTF8ENTITY("uml", 168),
+    UTF8ENTITY("copy", 169),
+    UTF8ENTITY("ordf", 170),
+    UTF8ENTITY("laquo", 171),
+    UTF8ENTITY("not", 172),
+    UTF8ENTITY("shy", 173),
+    UTF8ENTITY("reg", 174),
+    UTF8ENTITY("macr", 175),
+    UTF8ENTITY("deg", 176),
+    UTF8ENTITY("plusmn", 177),
+    UTF8ENTITY("sup2", 178),
+    UTF8ENTITY("sup3", 179),
+    UTF8ENTITY("acute", 180),
+    UTF8ENTITY("micro", 181),
+    UTF8ENTITY("para", 182),
+    UTF8ENTITY("middot", 183),
+    UTF8ENTITY("cedil", 184),
+    UTF8ENTITY("sup1", 185),
+    UTF8ENTITY("ordm", 186),
+    UTF8ENTITY("raquo", 187),
+    UTF8ENTITY("frac14", 188),
+    UTF8ENTITY("frac12", 189),
+    UTF8ENTITY("frac34", 190),
+    UTF8ENTITY("iquest", 191),
+    UTF8ENTITY("Agrave", 192),
+    UTF8ENTITY("Aacute", 193),
+    UTF8ENTITY("Acirc", 194),
+    UTF8ENTITY("Atilde", 195),
+    UTF8ENTITY("Auml", 196),
+    UTF8ENTITY("Aring", 197),
+    UTF8ENTITY("AElig", 198),
+    UTF8ENTITY("Ccedil", 199),
+    UTF8ENTITY("Egrave", 200),
+    UTF8ENTITY("Eacute", 201),
+    UTF8ENTITY("Ecirc", 202),
+    UTF8ENTITY("Euml", 203),
+    UTF8ENTITY("Igrave", 204),
+    UTF8ENTITY("Iacute", 205),
+    UTF8ENTITY("Icirc", 206),
+    UTF8ENTITY("Iuml", 207),
+    UTF8ENTITY("ETH", 208),
+    UTF8ENTITY("Ntilde", 209),
+    UTF8ENTITY("Ograve", 210),
+    UTF8ENTITY("Oacute", 211),
+    UTF8ENTITY("Ocirc", 212),
+    UTF8ENTITY("Otilde", 213),
+    UTF8ENTITY("Ouml", 214),
+    UTF8ENTITY("times", 215),
+    UTF8ENTITY("Oslash", 216),
+    UTF8ENTITY("Ugrave", 217),
+    UTF8ENTITY("Uacute", 218),
+    UTF8ENTITY("Ucirc", 219),
+    UTF8ENTITY("Uuml", 220),
+    UTF8ENTITY("Yacute", 221),
+    UTF8ENTITY("THORN", 222),
+    UTF8ENTITY("szlig", 223),
+    UTF8ENTITY("agrave", 224),
+    UTF8ENTITY("aacute", 225),
+    UTF8ENTITY("acirc", 226),
+    UTF8ENTITY("atilde", 227),
+    UTF8ENTITY("auml", 228),
+    UTF8ENTITY("aring", 229),
+    UTF8ENTITY("aelig", 230),
+    UTF8ENTITY("ccedil", 231),
+    UTF8ENTITY("egrave", 232),
+    UTF8ENTITY("eacute", 233),
+    UTF8ENTITY("ecirc", 234),
+    UTF8ENTITY("euml", 235),
+    UTF8ENTITY("igrave", 236),
+    UTF8ENTITY("iacute", 237),
+    UTF8ENTITY("icirc", 238),
+    UTF8ENTITY("iuml", 239),
+    UTF8ENTITY("eth", 240),
+    UTF8ENTITY("ntilde", 241),
+    UTF8ENTITY("ograve", 242),
+    UTF8ENTITY("oacute", 243),
+    UTF8ENTITY("ocirc", 244),
+    UTF8ENTITY("otilde", 245),
+    UTF8ENTITY("ouml", 246),
+    UTF8ENTITY("divide", 247),
+    UTF8ENTITY("oslash", 248),
+    UTF8ENTITY("ugrave", 249),
+    UTF8ENTITY("uacute", 250),
+    UTF8ENTITY("ucirc", 251),
+    UTF8ENTITY("uuml", 252),
+    UTF8ENTITY("yacute", 253),
+    UTF8ENTITY("thorn", 254),
+    UTF8ENTITY("yuml", 255),
+    UTF8ENTITY("fnof", 402),
+    // Greek
+    UTF8ENTITY("Alpha", 913),
+    UTF8ENTITY("Beta", 914),
+    UTF8ENTITY("Gamma", 915),
+    UTF8ENTITY("Delta", 916),
+    UTF8ENTITY("Epsilon", 917),
+    UTF8ENTITY("Zeta", 918),
+    UTF8ENTITY("Eta", 919),
+    UTF8ENTITY("Theta", 920),
+    UTF8ENTITY("Iota", 921),
+    UTF8ENTITY("Kappa", 922),
+    UTF8ENTITY("Lambda", 923),
+    UTF8ENTITY("Mu", 924),
+    UTF8ENTITY("Nu", 925),
+    UTF8ENTITY("Xi", 926),
+    UTF8ENTITY("Omicron", 927),
+    UTF8ENTITY("Pi", 928),
+    UTF8ENTITY("Rho", 929),
+    UTF8ENTITY("Sigma", 931),
+    UTF8ENTITY("Tau", 932),
+    UTF8ENTITY("Upsilon", 933),
+    UTF8ENTITY("Phi", 934),
+    UTF8ENTITY("Chi", 935),
+    UTF8ENTITY("Psi", 936),
+    UTF8ENTITY("Omega", 937),
+    UTF8ENTITY("alpha", 945),
+    UTF8ENTITY("beta", 946),
+    UTF8ENTITY("gamma", 947),
+    UTF8ENTITY("delta", 948),
+    UTF8ENTITY("epsilon", 949),
+    UTF8ENTITY("zeta", 950),
+    UTF8ENTITY("eta", 951),
+    UTF8ENTITY("theta", 952),
+    UTF8ENTITY("iota", 953),
+    UTF8ENTITY("kappa", 954),
+    UTF8ENTITY("lambda", 955),
+    UTF8ENTITY("mu", 956),
+    UTF8ENTITY("nu", 957),
+    UTF8ENTITY("xi", 958),
+    UTF8ENTITY("omicron", 959),
+    UTF8ENTITY("pi", 960),
+    UTF8ENTITY("rho", 961),
+    UTF8ENTITY("sigmaf", 962),
+    UTF8ENTITY("sigma", 963),
+    UTF8ENTITY("tau", 964),
+    UTF8ENTITY("upsilon", 965),
+    UTF8ENTITY("phi", 966),
+    UTF8ENTITY("chi", 967),
+    UTF8ENTITY("psi", 968),
+    UTF8ENTITY("omega", 969),
+    UTF8ENTITY("thetasym", 977),
+    UTF8ENTITY("upsih", 978),
+    UTF8ENTITY("piv", 982),
+    // General Punctuation
+    UTF8ENTITY("bull", 8226),
+    UTF8ENTITY("hellip", 8230),
+    UTF8ENTITY("prime", 8242),
+    UTF8ENTITY("Prime", 8243),
+    UTF8ENTITY("oline", 8254),
+    UTF8ENTITY("frasl", 8260),
+    // Letterlike Symbols
+    UTF8ENTITY("weierp", 8472),
+    UTF8ENTITY("image", 8465),
+    UTF8ENTITY("real", 8476),
+    UTF8ENTITY("trade", 8482),
+    UTF8ENTITY("alefsym", 8501),
+    // Arrows
+    UTF8ENTITY("larr", 8592),
+    UTF8ENTITY("uarr", 8593),
+    UTF8ENTITY("rarr", 8594),
+    UTF8ENTITY("darr", 8595),
+    UTF8ENTITY("harr", 8596),
+    UTF8ENTITY("crarr", 8629),
+    UTF8ENTITY("lArr", 8656),
+    UTF8ENTITY("uArr", 8657),
+    UTF8ENTITY("rArr", 8658),
+    UTF8ENTITY("dArr", 8659),
+    UTF8ENTITY("hArr", 8660),
+    // Mathematical Operators
+    UTF8ENTITY("forall", 8704),
+    UTF8ENTITY("part", 8706),
+    UTF8ENTITY("exist", 8707),
+    UTF8ENTITY("empty", 8709),
+    UTF8ENTITY("nabla", 8711),
+    UTF8ENTITY("isin", 8712),
+    UTF8ENTITY("notin", 8713),
+    UTF8ENTITY("ni", 8715),
+    UTF8ENTITY("prod", 8719),
+    UTF8ENTITY("sum", 8721),
+    UTF8ENTITY("minus", 8722),
+    UTF8ENTITY("lowast", 8727),
+    UTF8ENTITY("radic", 8730),
+    UTF8ENTITY("prop", 8733),
+    UTF8ENTITY("infin", 8734),
+    UTF8ENTITY("ang", 8736), // angle, U+2220: part of HTML 4.0, previously missing here
+    UTF8ENTITY("and", 8743),
+    UTF8ENTITY("or", 8744),
+    UTF8ENTITY("cap", 8745),
+    UTF8ENTITY("cup", 8746),
+    UTF8ENTITY("int", 8747),
+    UTF8ENTITY("there4", 8756),
+    UTF8ENTITY("sim", 8764),
+    UTF8ENTITY("cong", 8773),
+    UTF8ENTITY("asymp", 8776),
+    UTF8ENTITY("ne", 8800),
+    UTF8ENTITY("equiv", 8801),
+    UTF8ENTITY("le", 8804),
+    UTF8ENTITY("ge", 8805),
+    UTF8ENTITY("sub", 8834),
+    UTF8ENTITY("sup", 8835),
+    UTF8ENTITY("nsub", 8836),
+    UTF8ENTITY("sube", 8838),
+    UTF8ENTITY("supe", 8839),
+    UTF8ENTITY("oplus", 8853),
+    UTF8ENTITY("otimes", 8855),
+    UTF8ENTITY("perp", 8869),
+    UTF8ENTITY("sdot", 8901),
+    // Miscellaneous Technical
+    UTF8ENTITY("lceil", 8968),
+    UTF8ENTITY("rceil", 8969),
+    UTF8ENTITY("lfloor", 8970),
+    UTF8ENTITY("rfloor", 8971),
+    UTF8ENTITY("lang", 9001),
+    UTF8ENTITY("rang", 9002),
+    // Geometric Shapes
+    UTF8ENTITY("loz", 9674),
+    // Miscellaneous Symbols
+    UTF8ENTITY("spades", 9824),
+    UTF8ENTITY("clubs", 9827),
+    UTF8ENTITY("hearts", 9829),
+    UTF8ENTITY("diams", 9830),
+    // Markup-significant ASCII characters
+    UTF8ENTITY("quot", 34),
+    UTF8ENTITY("amp", 38),
+    UTF8ENTITY("lt", 60),
+    UTF8ENTITY("gt", 62),
+    // Latin Extended-A
+    UTF8ENTITY("OElig", 338),
+    UTF8ENTITY("oelig", 339),
+    UTF8ENTITY("Scaron", 352),
+    UTF8ENTITY("scaron", 353),
+    UTF8ENTITY("Yuml", 376),
+    // Spacing Modifier Letters
+    UTF8ENTITY("circ", 710),
+    UTF8ENTITY("tilde", 732),
+    // General Punctuation
+    UTF8ENTITY("ensp", 8194),
+    UTF8ENTITY("emsp", 8195),
+    UTF8ENTITY("thinsp", 8201),
+    UTF8ENTITY("zwnj", 8204),
+    UTF8ENTITY("zwj", 8205),
+    UTF8ENTITY("lrm", 8206),
+    UTF8ENTITY("rlm", 8207),
+    UTF8ENTITY("ndash", 8211),
+    UTF8ENTITY("mdash", 8212),
+    UTF8ENTITY("lsquo", 8216),
+    UTF8ENTITY("rsquo", 8217),
+    UTF8ENTITY("sbquo", 8218),
+    UTF8ENTITY("ldquo", 8220),
+    UTF8ENTITY("rdquo", 8221),
+    UTF8ENTITY("bdquo", 8222),
+    UTF8ENTITY("dagger", 8224),
+    UTF8ENTITY("Dagger", 8225),
+    UTF8ENTITY("permil", 8240),
+    UTF8ENTITY("lsaquo", 8249),
+    UTF8ENTITY("rsaquo", 8250),
+    UTF8ENTITY("euro", 8364),
+    UTF8ENTITY(NULL, 0) // End of the list
+}; // End of HTMLEntityData
+
+/*
+ * Lookup table from a Unicode code point to the name of the HTML entity
+ * that represents it. Despite the class name, the mapped value is the
+ * entity name (e.g. "euro"), not a UTF-8 byte sequence.
+ *
+ * std::map is inherited privately; only find() and end() are re-exported,
+ * so outside code can query the table but not modify it.
+ */
+class Char2UTF8Map : std::map<unsigned long int, std::string>
+{
+public:
+    typedef std::map<unsigned long int, std::string>::iterator iterator;
+    typedef std::pair<unsigned long int, std::string> pair;
+
+    /*
+     * Fills the table from `entities`, an array terminated by a row whose
+     * Name is NULL. A NULL array yields an empty table. When a code point
+     * appears more than once, the first row wins (std::map::insert does
+     * not overwrite an existing key).
+     */
+    Char2UTF8Map(const HTMLEntityDataType *entities)
+    {
+        // Walk the array that was passed in rather than a hard-wired global.
+        for (const HTMLEntityDataType *ent = entities; ent && ent->Name; ent++)
+        {
+            insert ( pair(ent->Char,ent->Name) );
+        }
+    }
+
+    /* Past-the-end iterator; compare against the result of find(). */
+    inline iterator end()
+    {
+        return std::map<unsigned long int, std::string>::end();
+    }
+
+    /* Returns an iterator to the entry for code point `ch`, or end(). */
+    inline iterator find(unsigned long int ch)
+    {
+        return std::map<unsigned long int, std::string>::find(ch);
+    }
+};
+
+/* File-local table built once from HTMLEntityData; used by QuoteHTML(). */
+static Char2UTF8Map char2utf8(HTMLEntityData);
+
+/*
+ * Writes the NUL-terminated UTF-8 string `in` to `out`, replacing every
+ * non-ASCII character with an HTML character reference:
+ *
+ *   - code points below 128 are copied through unchanged (this includes
+ *     '<', '>', '&' and '"', so existing markup is preserved);
+ *   - code points found in the entity table become "&name;";
+ *   - all other code points become a numeric reference "&#N;".
+ *
+ * Conversion stops at the first sequence utf8_to_char() rejects; whatever
+ * was converted before that point has already been written. A NULL `in`
+ * writes nothing.
+ */
+void QuoteHTML(std::ostream &out, const char *in)
+{
+    if (!in)
+        return;
+
+    const char *ptr = in;
+    int len = strlen(in);
+    while (*ptr)
+    {
+        unsigned long int ch;
+        int adv = utf8::parser::utf8_to_char(&ch, ptr, len);
+
+        // utf8_to_char() reports errors as negative values. Advancing by
+        // one would move ptr backwards, so only a positive length that
+        // fits in the remaining input is accepted. adv == len is the
+        // normal case for the last character and must not be skipped.
+        if (adv <= 0 || adv > len)
+            break;
+
+        if (ch < 128)
+        {
+            out.put(*ptr);
+        }
+        else
+        {
+            Char2UTF8Map::iterator it = char2utf8.find(ch);
+            if (it != char2utf8.end())
+            {
+                out << "&" << it->second << ";";
+            }
+            else
+            {
+                out << "&#" << ch << ";";
+            }
+        }
+        len -= adv;
+        ptr += adv;
+    }
+}
+
+
+#ifdef UNIT_TEST
+
+/*
+ * Encodes `ch` as UTF-8 and returns the bytes packed into one integer:
+ * first byte in bits 0-7, second in bits 8-15, third in bits 16-23.
+ * Only encodings of up to three bytes are produced (16-bit code points).
+ */
+static uint32_t Ch2UTF8(unsigned int ch)
+{
+    // One byte: the code point is its own encoding.
+    if (ch < 0x80)
+        return ch;
+
+    // The last byte always carries the low six bits.
+    const unsigned int last = 0x80 | (ch & 0x3F);
+
+    if (ch < 0x800)
+    {
+        const unsigned int lead = 0xC0 | ((ch >> 6) & 0x1F);
+        return lead + (last << 8);
+    }
+
+    const unsigned int lead = 0xE0 | ((ch >> 12) & 0x0F);
+    const unsigned int middle = 0x80 | ((ch >> 6) & 0x3F);
+    return lead + (middle << 8) + (last << 16);
+}
+
+/*
+ * Unit test for the entity table: for every row, the packed UTF-8 value
+ * precomputed by UTF8ENTITY must equal the one Ch2UTF8() computes at run
+ * time. Afterwards a sample string is passed through QuoteHTML() and
+ * printed to stdout for visual inspection (that output is not asserted).
+ */
+TEST_FUNCTION TestCuUTF8HTML(CuTest* tc)
+{
+    for (const HTMLEntityDataType *ent = HTMLEntityData; ent->Name; ent++)
+    {
+        uint32_t utf8 = Ch2UTF8(ent->Char);
+        // Compare the values, not the first bytes of the objects: the
+        // field is an unsigned long and utf8 a uint32_t, so a memcmp of
+        // the leading bytes depends on byte order and type width.
+        CuAssertTrue(tc, ent->UTF8 == utf8 );
+    }
+    QuoteHTML(std::cout, "Hou>stño€n\n");
+    std::cout << std::endl;
+
+}
+#endif
+
diff --git a/utf8/parser.cpp b/utf8/parser.cpp
new file mode 100644
index 0000000..427c214
--- /dev/null
+++ b/utf8/parser.cpp
@@ -0,0 +1,788 @@
+/*
+ * parser.cpp - Raptor UTF-8 and Unicode support
+ *
+ * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
+ * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ * Copyright (C) 2009 Miriam Ruiz <little_miry at yahoo.es>
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ */
+
+#include "../common.h"
+#include "parser.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+namespace utf8 { namespace parser {
+
+/**
+ * char_to_utf8:
+ * @c: Unicode character
+ * @output: UTF-8 string buffer or NULL
+ *
+ * Convert a Unicode character to UTF-8 encoding.
+ *
+ * If @output is NULL nothing is written and only the number of bytes the
+ * encoding needs is returned. Otherwise @output must have room for that
+ * many bytes (at most 6); no terminating NUL is appended.
+ *
+ * Values up to 0x7FFFFFFF are accepted, so the 5 and 6 byte forms of the
+ * original UTF-8 definition can be produced.
+ *
+ * Return value: bytes encoded to output buffer or <0 on failure
+ * (@c is 0x80000000 or larger)
+ **/
+int char_to_utf8(unsigned long c, char *output)
+{
+    int size=0;
+
+    if (c < 0x00000080)
+        size=1;
+    else if (c < 0x00000800)
+        size=2;
+    else if (c < 0x00010000)
+        size=3;
+    else if (c < 0x00200000)
+        size=4;
+    else if (c < 0x04000000)
+        size=5;
+    else if (c < 0x80000000)
+        size=6;
+    else
+        return -1;
+
+    /* length-only query, as documented for a NULL buffer */
+    if (!output)
+        return size;
+
+    /*
+     * Bytes are written from last to first. Each step stores the low six
+     * bits as a continuation byte, shifts them out, and ORs in one more
+     * marker bit so that the value left for output[0] already carries
+     * the complete lead-byte prefix.
+     */
+    switch(size)
+    {
+        case 6:
+            output[5]=0x80 | (unsigned char)(c & 0x3F);
+            c= c >> 6;
+            /* set bit 2 (bits 7,6,5,4,3,2 less 7,6,5,4,3 set below) on last byte */
+            c |= 0x4000000; /* 0x4000000 = 0x04 << 24 */
+            /* FALLTHROUGH */
+        case 5:
+            output[4]=0x80 | (unsigned char)(c & 0x3F);
+            c= c >> 6;
+            /* set bit 3 (bits 7,6,5,4,3 less 7,6,5,4 set below) on last byte */
+            c |= 0x200000; /* 0x200000 = 0x08 << 18 */
+            /* FALLTHROUGH */
+        case 4:
+            output[3]=0x80 | (unsigned char)(c & 0x3F);
+            c= c >> 6;
+            /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on last byte */
+            c |= 0x10000; /* 0x10000 = 0x10 << 12 */
+            /* FALLTHROUGH */
+        case 3:
+            output[2]=0x80 | (unsigned char)(c & 0x3F);
+            c= c >> 6;
+            /* set bit 5 (bits 7,6,5 less 7,6 set below) on last byte */
+            c |= 0x800; /* 0x800 = 0x20 << 6 */
+            /* FALLTHROUGH */
+        case 2:
+            output[1]=0x80 | (unsigned char)(c & 0x3F);
+            c= c >> 6;
+            /* set bits 7,6 on last byte */
+            c |= 0xc0;
+            /* FALLTHROUGH */
+        case 1:
+            output[0]=(unsigned char)c;
+    }
+
+    return size;
+}
+
+
+/**
+ * utf8_to_char:
+ * @output: Pointer to the Unicode character or NULL
+ * @input: UTF-8 string buffer
+ * @length: buffer size
+ *
+ * Convert an UTF-8 encoded buffer to a Unicode character.
+ *
+ * If output is NULL, then will calculate the number of bytes that
+ * will be used from the input buffer and not perform the conversion;
+ * in that mode only the first byte is examined and @length is only
+ * required to be at least 1.
+ *
+ * Return value: bytes used from input buffer or <0 on failure:
+ * -1 input buffer too short, length error, or a byte that cannot
+ *    appear at its position (bad lead byte or bad continuation byte)
+ * -2 overlong UTF-8 sequence
+ * -3 illegal code positions
+ * -4 code out of range U+0000 to U+10FFFF.
+ * In cases -2, -3 and -4 the coded character is stored in the output.
+ */
+int utf8_to_char(unsigned long *output, const char *input, int length)
+{
+    unsigned char in;
+    int size;
+    int i;
+    unsigned long c=0;
+
+    if(length < 1)
+        return -1;
+
+    /* the lead byte fixes the sequence length and supplies the top bits */
+    in=*input++;
+    if((in & 0x80) == 0)
+    {
+        size=1;
+        c= in & 0x7f;
+    }
+    else if((in & 0xe0) == 0xc0)
+    {
+        size=2;
+        c= in & 0x1f;
+    }
+    else if((in & 0xf0) == 0xe0)
+    {
+        size=3;
+        c= in & 0x0f;
+    }
+    else if((in & 0xf8) == 0xf0)
+    {
+        size=4;
+        c = in & 0x07;
+    }
+    else if((in & 0xfc) == 0xf8)
+    {
+        size=5;
+        c = in & 0x03;
+    }
+    else if((in & 0xfe) == 0xfc)
+    {
+        size=6;
+        c = in & 0x01;
+    } else
+        return -1; /* a continuation byte (10xxxxxx), 0xFE or 0xFF */
+
+    if(!output)
+        return size;
+
+    if(length < size)
+        return -1;
+
+    /* every following byte must be 10xxxxxx and contributes six bits */
+    for(i=1; i < size; i++)
+    {
+        in=*input++;
+        if((in & 0xc0) != 0x80)
+            return -1;
+        c= c << 6;
+        c |= (in & 0x3f);
+    }
+
+    *output=c;
+
+    /* check for overlong UTF-8 sequences: the value would have fitted
+     * in a shorter form */
+    switch(size)
+    {
+        case 2:
+            if(c < 0x00000080)
+                return -2;
+            break;
+        case 3:
+            if(c < 0x00000800)
+                return -2;
+            break;
+        case 4:
+            if(c < 0x00010000)
+                return -2;
+            break;
+        case 5:
+            if(c < 0x00200000)
+                return -2;
+            break;
+        case 6:
+            if(c < 0x04000000)
+                return -2;
+            break;
+
+        default: // 1
+            break;
+    }
+
+    /* check for illegal code positions:
+     * U+D800 to U+DFFF (UTF-16 surrogates)
+     * U+FFFE and U+FFFF
+     */
+    if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
+        return -3;
+
+    /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
+    /* of course this makes some 4 byte forms illegal */
+    if(c > 0x10ffff)
+        return -4;
+
+    return size;
+}
+
+
+/**
+ * is_xml11_namestartchar:
+ * @c: Unicode character to check
+ *
+ * Test whether a code point may begin an XML 1.1 Name, leaving out ':'.
+ *
+ * The accepted ranges follow the NameStartChar production:
+ * Namespaces in XML 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
+ * updating
+ * Extensible Markup Language (XML) 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
+ *
+ * Return value: 1 if legal, 0 otherwise
+ **/
+int is_xml11_namestartchar(long c)
+{
+    /* inclusive [first, last] ranges */
+    static const long ranges[][2] = {
+        { 0x0041, 0x005A },   /* [A-Z] */
+        { 0x005F, 0x005F },   /* '_' */
+        { 0x0061, 0x007A },   /* [a-z] */
+        { 0x00C0, 0x00D6 },
+        { 0x00D8, 0x00F6 },
+        { 0x00F8, 0x02FF },
+        { 0x0370, 0x037D },
+        { 0x037F, 0x1FFF },
+        { 0x200C, 0x200D },
+        { 0x2070, 0x218F },
+        { 0x2C00, 0x2FEF },
+        { 0x3001, 0xD7FF },
+        { 0xF900, 0xFDCF },
+        { 0xFDF0, 0xFFFD },
+        { 0x10000, 0xEFFFF }
+    };
+    const unsigned int count = sizeof(ranges) / sizeof(ranges[0]);
+
+    for (unsigned int i = 0; i < count; i++)
+    {
+        if (c >= ranges[i][0] && c <= ranges[i][1])
+            return 1;
+    }
+    return 0;
+}
+
+
+/**
+ * is_xml10_namestartchar:
+ * @c: Unicode character to check
+ *
+ * Test whether a code point may begin an XML 1.0 Name, leaving out ':'.
+ * That is the case for any Letter and for the underscore.
+ *
+ * Namespaces in XML REC 1999-01-14
+ * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
+ * updating
+ * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml-20040204/
+ *
+ * Return value: non-0 if legal
+ **/
+int is_xml10_namestartchar(long c)
+{
+    if (is_letter(c))
+        return 1;
+
+    return c == '_';
+}
+
+
+/**
+ * is_xml11_namechar:
+ * @c: Unicode character
+ *
+ * Test whether a code point may appear after the first character of an
+ * XML 1.1 Name, leaving out ':'. Everything allowed as a name start is
+ * allowed, plus '-', '.', the ASCII digits, U+00B7 and two further ranges.
+ *
+ * Namespaces in XML 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/
+ * updating
+ * Extensible Markup Language (XML) 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
+ *
+ * Return value: non-0 if legal
+ **/
+int is_xml11_namechar(long c)
+{
+    if (is_xml11_namestartchar(c))
+        return 1;
+
+    /* single code points */
+    switch (c)
+    {
+        case 0x002D: /* '-' */
+        case 0x002E: /* '.' */
+        case 0x00B7:
+            return 1;
+        default:
+            break;
+    }
+
+    /* 0-9 */
+    if (c >= 0x0030 && c <= 0x0039)
+        return 1;
+    if (c >= 0x0300 && c <= 0x036F)
+        return 1;
+    if (c >= 0x203F && c <= 0x2040)
+        return 1;
+
+    return 0;
+}
+
+
+/**
+ * is_xml10_namechar:
+ * @c: Unicode character
+ *
+ * Test whether a code point may appear after the first character of an
+ * XML 1.0 Name, leaving out ':'. Accepted are letters, digits, the
+ * punctuation '.', '-' and '_', combining characters and extenders.
+ *
+ * Namespaces in XML REC 1999-01-14
+ * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar
+ * updating
+ * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml-20040204/
+ *
+ * Return value: non-0 if legal
+ **/
+int is_xml10_namechar(long c)
+{
+    /* '-', '.', '_' */
+    if (c == 0x002D || c == 0x002E || c == 0x005F)
+        return 1;
+
+    if (is_letter(c) || is_digit(c))
+        return 1;
+
+    return is_combiningchar(c) || is_extender(c);
+}
+
+
+/*
+ * All this below was derived by machine-transforming the classes in Appendix B
+ * of http://www.w3.org/TR/2000/REC-xml-20001006
+ */
+
+/* A Letter is either a BaseChar or an Ideographic; returns 1 or 0. */
+int is_letter(long c)
+{
+    if (is_basechar(c))
+        return 1;
+
+    return is_ideographic(c) ? 1 : 0;
+}
+
+
+/*
+ * Returns 1 when c lies in one of the code point ranges listed below
+ * (the BaseChar class referenced in the comment inside), 0 otherwise.
+ * The list is a plain chain of range tests; entries are inclusive.
+ */
+int is_basechar(long c)
+{
+ /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */
+ return((c >= 0x0041 && c <= 0x005A ) ||
+ (c >= 0x0061 && c <= 0x007A ) ||
+ (c >= 0x00C0 && c <= 0x00D6 ) ||
+ (c >= 0x00D8 && c <= 0x00F6 ) ||
+ (c >= 0x00F8 && c <= 0x00FF ) ||
+ (c >= 0x0100 && c <= 0x0131 ) ||
+ (c >= 0x0134 && c <= 0x013E ) ||
+ (c >= 0x0141 && c <= 0x0148 ) ||
+ (c >= 0x014A && c <= 0x017E ) ||
+ (c >= 0x0180 && c <= 0x01C3 ) ||
+ (c >= 0x01CD && c <= 0x01F0 ) ||
+ (c >= 0x01F4 && c <= 0x01F5 ) ||
+ (c >= 0x01FA && c <= 0x0217 ) ||
+ (c >= 0x0250 && c <= 0x02A8 ) ||
+ (c >= 0x02BB && c <= 0x02C1 ) ||
+ (c == 0x0386) ||
+ (c >= 0x0388 && c <= 0x038A ) ||
+ (c == 0x038C) ||
+ (c >= 0x038E && c <= 0x03A1 ) ||
+ (c >= 0x03A3 && c <= 0x03CE ) ||
+ (c >= 0x03D0 && c <= 0x03D6 ) ||
+ (c == 0x03DA) ||
+ (c == 0x03DC) ||
+ (c == 0x03DE) ||
+ (c == 0x03E0) ||
+ (c >= 0x03E2 && c <= 0x03F3 ) ||
+ (c >= 0x0401 && c <= 0x040C ) ||
+ (c >= 0x040E && c <= 0x044F ) ||
+ (c >= 0x0451 && c <= 0x045C ) ||
+ (c >= 0x045E && c <= 0x0481 ) ||
+ (c >= 0x0490 && c <= 0x04C4 ) ||
+ (c >= 0x04C7 && c <= 0x04C8 ) ||
+ (c >= 0x04CB && c <= 0x04CC ) ||
+ (c >= 0x04D0 && c <= 0x04EB ) ||
+ (c >= 0x04EE && c <= 0x04F5 ) ||
+ (c >= 0x04F8 && c <= 0x04F9 ) ||
+ (c >= 0x0531 && c <= 0x0556 ) ||
+ (c == 0x0559) ||
+ (c >= 0x0561 && c <= 0x0586 ) ||
+ (c >= 0x05D0 && c <= 0x05EA ) ||
+ (c >= 0x05F0 && c <= 0x05F2 ) ||
+ (c >= 0x0621 && c <= 0x063A ) ||
+ (c >= 0x0641 && c <= 0x064A ) ||
+ (c >= 0x0671 && c <= 0x06B7 ) ||
+ (c >= 0x06BA && c <= 0x06BE ) ||
+ (c >= 0x06C0 && c <= 0x06CE ) ||
+ (c >= 0x06D0 && c <= 0x06D3 ) ||
+ (c == 0x06D5) ||
+ (c >= 0x06E5 && c <= 0x06E6 ) ||
+ (c >= 0x0905 && c <= 0x0939 ) ||
+ (c == 0x093D) ||
+ (c >= 0x0958 && c <= 0x0961 ) ||
+ (c >= 0x0985 && c <= 0x098C ) ||
+ (c >= 0x098F && c <= 0x0990 ) ||
+ (c >= 0x0993 && c <= 0x09A8 ) ||
+ (c >= 0x09AA && c <= 0x09B0 ) ||
+ (c == 0x09B2) ||
+ (c >= 0x09B6 && c <= 0x09B9 ) ||
+ (c >= 0x09DC && c <= 0x09DD ) ||
+ (c >= 0x09DF && c <= 0x09E1 ) ||
+ (c >= 0x09F0 && c <= 0x09F1 ) ||
+ (c >= 0x0A05 && c <= 0x0A0A ) ||
+ (c >= 0x0A0F && c <= 0x0A10 ) ||
+ (c >= 0x0A13 && c <= 0x0A28 ) ||
+ (c >= 0x0A2A && c <= 0x0A30 ) ||
+ (c >= 0x0A32 && c <= 0x0A33 ) ||
+ (c >= 0x0A35 && c <= 0x0A36 ) ||
+ (c >= 0x0A38 && c <= 0x0A39 ) ||
+ (c >= 0x0A59 && c <= 0x0A5C ) ||
+ (c == 0x0A5E) ||
+ (c >= 0x0A72 && c <= 0x0A74 ) ||
+ (c >= 0x0A85 && c <= 0x0A8B ) ||
+ (c == 0x0A8D) ||
+ (c >= 0x0A8F && c <= 0x0A91 ) ||
+ (c >= 0x0A93 && c <= 0x0AA8 ) ||
+ (c >= 0x0AAA && c <= 0x0AB0 ) ||
+ (c >= 0x0AB2 && c <= 0x0AB3 ) ||
+ (c >= 0x0AB5 && c <= 0x0AB9 ) ||
+ (c == 0x0ABD) ||
+ (c == 0x0AE0) ||
+ (c >= 0x0B05 && c <= 0x0B0C ) ||
+ (c >= 0x0B0F && c <= 0x0B10 ) ||
+ (c >= 0x0B13 && c <= 0x0B28 ) ||
+ (c >= 0x0B2A && c <= 0x0B30 ) ||
+ (c >= 0x0B32 && c <= 0x0B33 ) ||
+ (c >= 0x0B36 && c <= 0x0B39 ) ||
+ (c == 0x0B3D) ||
+ (c >= 0x0B5C && c <= 0x0B5D ) ||
+ (c >= 0x0B5F && c <= 0x0B61 ) ||
+ (c >= 0x0B85 && c <= 0x0B8A ) ||
+ (c >= 0x0B8E && c <= 0x0B90 ) ||
+ (c >= 0x0B92 && c <= 0x0B95 ) ||
+ (c >= 0x0B99 && c <= 0x0B9A ) ||
+ (c == 0x0B9C) ||
+ (c >= 0x0B9E && c <= 0x0B9F ) ||
+ (c >= 0x0BA3 && c <= 0x0BA4 ) ||
+ (c >= 0x0BA8 && c <= 0x0BAA ) ||
+ (c >= 0x0BAE && c <= 0x0BB5 ) ||
+ (c >= 0x0BB7 && c <= 0x0BB9 ) ||
+ (c >= 0x0C05 && c <= 0x0C0C ) ||
+ (c >= 0x0C0E && c <= 0x0C10 ) ||
+ (c >= 0x0C12 && c <= 0x0C28 ) ||
+ (c >= 0x0C2A && c <= 0x0C33 ) ||
+ (c >= 0x0C35 && c <= 0x0C39 ) ||
+ (c >= 0x0C60 && c <= 0x0C61 ) ||
+ (c >= 0x0C85 && c <= 0x0C8C ) ||
+ (c >= 0x0C8E && c <= 0x0C90 ) ||
+ (c >= 0x0C92 && c <= 0x0CA8 ) ||
+ (c >= 0x0CAA && c <= 0x0CB3 ) ||
+ (c >= 0x0CB5 && c <= 0x0CB9 ) ||
+ (c == 0x0CDE) ||
+ (c >= 0x0CE0 && c <= 0x0CE1 ) ||
+ (c >= 0x0D05 && c <= 0x0D0C ) ||
+ (c >= 0x0D0E && c <= 0x0D10 ) ||
+ (c >= 0x0D12 && c <= 0x0D28 ) ||
+ (c >= 0x0D2A && c <= 0x0D39 ) ||
+ (c >= 0x0D60 && c <= 0x0D61 ) ||
+ (c >= 0x0E01 && c <= 0x0E2E ) ||
+ (c == 0x0E30) ||
+ (c >= 0x0E32 && c <= 0x0E33 ) ||
+ (c >= 0x0E40 && c <= 0x0E45 ) ||
+ (c >= 0x0E81 && c <= 0x0E82 ) ||
+ (c == 0x0E84) ||
+ (c >= 0x0E87 && c <= 0x0E88 ) ||
+ (c == 0x0E8A) ||
+ (c == 0x0E8D) ||
+ (c >= 0x0E94 && c <= 0x0E97 ) ||
+ (c >= 0x0E99 && c <= 0x0E9F ) ||
+ (c >= 0x0EA1 && c <= 0x0EA3 ) ||
+ (c == 0x0EA5) ||
+ (c == 0x0EA7) ||
+ (c >= 0x0EAA && c <= 0x0EAB ) ||
+ (c >= 0x0EAD && c <= 0x0EAE ) ||
+ (c == 0x0EB0) ||
+ (c >= 0x0EB2 && c <= 0x0EB3 ) ||
+ (c == 0x0EBD) ||
+ (c >= 0x0EC0 && c <= 0x0EC4 ) ||
+ (c >= 0x0F40 && c <= 0x0F47 ) ||
+ (c >= 0x0F49 && c <= 0x0F69 ) ||
+ (c >= 0x10A0 && c <= 0x10C5 ) ||
+ (c >= 0x10D0 && c <= 0x10F6 ) ||
+ (c == 0x1100) ||
+ (c >= 0x1102 && c <= 0x1103 ) ||
+ (c >= 0x1105 && c <= 0x1107 ) ||
+ (c == 0x1109) ||
+ (c >= 0x110B && c <= 0x110C ) ||
+ (c >= 0x110E && c <= 0x1112 ) ||
+ (c == 0x113C) ||
+ (c == 0x113E) ||
+ (c == 0x1140) ||
+ (c == 0x114C) ||
+ (c == 0x114E) ||
+ (c == 0x1150) ||
+ (c >= 0x1154 && c <= 0x1155 ) ||
+ (c == 0x1159) ||
+ (c >= 0x115F && c <= 0x1161 ) ||
+ (c == 0x1163) ||
+ (c == 0x1165) ||
+ (c == 0x1167) ||
+ (c == 0x1169) ||
+ (c >= 0x116D && c <= 0x116E ) ||
+ (c >= 0x1172 && c <= 0x1173 ) ||
+ (c == 0x1175) ||
+ (c == 0x119E) ||
+ (c == 0x11A8) ||
+ (c == 0x11AB) ||
+ (c >= 0x11AE && c <= 0x11AF ) ||
+ (c >= 0x11B7 && c <= 0x11B8 ) ||
+ (c == 0x11BA) ||
+ (c >= 0x11BC && c <= 0x11C2 ) ||
+ (c == 0x11EB) ||
+ (c == 0x11F0) ||
+ (c == 0x11F9) ||
+ (c >= 0x1E00 && c <= 0x1E9B ) ||
+ (c >= 0x1EA0 && c <= 0x1EF9 ) ||
+ (c >= 0x1F00 && c <= 0x1F15 ) ||
+ (c >= 0x1F18 && c <= 0x1F1D ) ||
+ (c >= 0x1F20 && c <= 0x1F45 ) ||
+ (c >= 0x1F48 && c <= 0x1F4D ) ||
+ (c >= 0x1F50 && c <= 0x1F57 ) ||
+ (c == 0x1F59) ||
+ (c == 0x1F5B) ||
+ (c == 0x1F5D) ||
+ (c >= 0x1F5F && c <= 0x1F7D ) ||
+ (c >= 0x1F80 && c <= 0x1FB4 ) ||
+ (c >= 0x1FB6 && c <= 0x1FBC ) ||
+ (c == 0x1FBE) ||
+ (c >= 0x1FC2 && c <= 0x1FC4 ) ||
+ (c >= 0x1FC6 && c <= 0x1FCC ) ||
+ (c >= 0x1FD0 && c <= 0x1FD3 ) ||
+ (c >= 0x1FD6 && c <= 0x1FDB ) ||
+ (c >= 0x1FE0 && c <= 0x1FEC ) ||
+ (c >= 0x1FF2 && c <= 0x1FF4 ) ||
+ (c >= 0x1FF6 && c <= 0x1FFC ) ||
+ (c == 0x2126) ||
+ (c >= 0x212A && c <= 0x212B ) ||
+ (c == 0x212E) ||
+ (c >= 0x2180 && c <= 0x2182 ) ||
+ (c >= 0x3041 && c <= 0x3094 ) ||
+ (c >= 0x30A1 && c <= 0x30FA ) ||
+ (c >= 0x3105 && c <= 0x312C ) ||
+ (c >= 0xAC00 && c <= 0xD7A3 )
+ );
+}
+
+
+/*
+ * Returns 1 when c belongs to the Ideographic class, 0 otherwise.
+ * http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic
+ */
+int is_ideographic(long c)
+{
+    if (c == 0x3007)
+        return 1;
+    if (c >= 0x3021 && c <= 0x3029)
+        return 1;
+    if (c >= 0x4E00 && c <= 0x9FA5)
+        return 1;
+    return 0;
+}
+
+
+/*
+ * Returns 1 when c lies in one of the code point ranges listed below
+ * (the CombiningChar class referenced in the comment inside), 0 otherwise.
+ * Range bounds are inclusive.
+ */
+int is_combiningchar(long c)
+{
+ /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar */
+ return((c >= 0x0300 && c <= 0x0345 ) ||
+ (c >= 0x0360 && c <= 0x0361 ) ||
+ (c >= 0x0483 && c <= 0x0486 ) ||
+ (c >= 0x0591 && c <= 0x05A1 ) ||
+ (c >= 0x05A3 && c <= 0x05B9 ) ||
+ (c >= 0x05BB && c <= 0x05BD ) ||
+ (c == 0x05BF) ||
+ (c >= 0x05C1 && c <= 0x05C2 ) ||
+ (c == 0x05C4) ||
+ (c >= 0x064B && c <= 0x0652 ) ||
+ (c == 0x0670) ||
+ (c >= 0x06D6 && c <= 0x06DC ) ||
+ (c >= 0x06DD && c <= 0x06DF ) ||
+ (c >= 0x06E0 && c <= 0x06E4 ) ||
+ (c >= 0x06E7 && c <= 0x06E8 ) ||
+ (c >= 0x06EA && c <= 0x06ED ) ||
+ (c >= 0x0901 && c <= 0x0903 ) ||
+ (c == 0x093C) ||
+ (c >= 0x093E && c <= 0x094C ) ||
+ (c == 0x094D) ||
+ (c >= 0x0951 && c <= 0x0954 ) ||
+ (c >= 0x0962 && c <= 0x0963 ) ||
+ (c >= 0x0981 && c <= 0x0983 ) ||
+ (c == 0x09BC) ||
+ (c == 0x09BE) ||
+ (c == 0x09BF) ||
+ (c >= 0x09C0 && c <= 0x09C4 ) ||
+ (c >= 0x09C7 && c <= 0x09C8 ) ||
+ (c >= 0x09CB && c <= 0x09CD ) ||
+ (c == 0x09D7) ||
+ (c >= 0x09E2 && c <= 0x09E3 ) ||
+ (c == 0x0A02) ||
+ (c == 0x0A3C) ||
+ (c == 0x0A3E) ||
+ (c == 0x0A3F) ||
+ (c >= 0x0A40 && c <= 0x0A42 ) ||
+ (c >= 0x0A47 && c <= 0x0A48 ) ||
+ (c >= 0x0A4B && c <= 0x0A4D ) ||
+ (c >= 0x0A70 && c <= 0x0A71 ) ||
+ (c >= 0x0A81 && c <= 0x0A83 ) ||
+ (c == 0x0ABC) ||
+ (c >= 0x0ABE && c <= 0x0AC5 ) ||
+ (c >= 0x0AC7 && c <= 0x0AC9 ) ||
+ (c >= 0x0ACB && c <= 0x0ACD ) ||
+ (c >= 0x0B01 && c <= 0x0B03 ) ||
+ (c == 0x0B3C) ||
+ (c >= 0x0B3E && c <= 0x0B43 ) ||
+ (c >= 0x0B47 && c <= 0x0B48 ) ||
+ (c >= 0x0B4B && c <= 0x0B4D ) ||
+ (c >= 0x0B56 && c <= 0x0B57 ) ||
+ (c >= 0x0B82 && c <= 0x0B83 ) ||
+ (c >= 0x0BBE && c <= 0x0BC2 ) ||
+ (c >= 0x0BC6 && c <= 0x0BC8 ) ||
+ (c >= 0x0BCA && c <= 0x0BCD ) ||
+ (c == 0x0BD7) ||
+ (c >= 0x0C01 && c <= 0x0C03 ) ||
+ (c >= 0x0C3E && c <= 0x0C44 ) ||
+ (c >= 0x0C46 && c <= 0x0C48 ) ||
+ (c >= 0x0C4A && c <= 0x0C4D ) ||
+ (c >= 0x0C55 && c <= 0x0C56 ) ||
+ (c >= 0x0C82 && c <= 0x0C83 ) ||
+ (c >= 0x0CBE && c <= 0x0CC4 ) ||
+ (c >= 0x0CC6 && c <= 0x0CC8 ) ||
+ (c >= 0x0CCA && c <= 0x0CCD ) ||
+ (c >= 0x0CD5 && c <= 0x0CD6 ) ||
+ (c >= 0x0D02 && c <= 0x0D03 ) ||
+ (c >= 0x0D3E && c <= 0x0D43 ) ||
+ (c >= 0x0D46 && c <= 0x0D48 ) ||
+ (c >= 0x0D4A && c <= 0x0D4D ) ||
+ (c == 0x0D57) ||
+ (c == 0x0E31) ||
+ (c >= 0x0E34 && c <= 0x0E3A ) ||
+ (c >= 0x0E47 && c <= 0x0E4E ) ||
+ (c == 0x0EB1) ||
+ (c >= 0x0EB4 && c <= 0x0EB9 ) ||
+ (c >= 0x0EBB && c <= 0x0EBC ) ||
+ (c >= 0x0EC8 && c <= 0x0ECD ) ||
+ (c >= 0x0F18 && c <= 0x0F19 ) ||
+ (c == 0x0F35) ||
+ (c == 0x0F37) ||
+ (c == 0x0F39) ||
+ (c == 0x0F3E) ||
+ (c == 0x0F3F) ||
+ (c >= 0x0F71 && c <= 0x0F84 ) ||
+ (c >= 0x0F86 && c <= 0x0F8B ) ||
+ (c >= 0x0F90 && c <= 0x0F95 ) ||
+ (c == 0x0F97) ||
+ (c >= 0x0F99 && c <= 0x0FAD ) ||
+ (c >= 0x0FB1 && c <= 0x0FB7 ) ||
+ (c == 0x0FB9) ||
+ (c >= 0x20D0 && c <= 0x20DC ) ||
+ (c == 0x20E1) ||
+ (c >= 0x302A && c <= 0x302F ) ||
+ (c == 0x3099) ||
+ (c == 0x309A));
+}
+
+
+/*
+ * Returns 1 when c belongs to the Digit class, 0 otherwise.
+ * http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit
+ */
+int is_digit(long c)
+{
+    /* inclusive [first, last] ranges */
+    static const long ranges[][2] = {
+        { 0x0030, 0x0039 },
+        { 0x0660, 0x0669 },
+        { 0x06F0, 0x06F9 },
+        { 0x0966, 0x096F },
+        { 0x09E6, 0x09EF },
+        { 0x0A66, 0x0A6F },
+        { 0x0AE6, 0x0AEF },
+        { 0x0B66, 0x0B6F },
+        { 0x0BE7, 0x0BEF },
+        { 0x0C66, 0x0C6F },
+        { 0x0CE6, 0x0CEF },
+        { 0x0D66, 0x0D6F },
+        { 0x0E50, 0x0E59 },
+        { 0x0ED0, 0x0ED9 },
+        { 0x0F20, 0x0F29 }
+    };
+    const unsigned int count = sizeof(ranges) / sizeof(ranges[0]);
+
+    for (unsigned int i = 0; i < count; i++)
+    {
+        if (c >= ranges[i][0] && c <= ranges[i][1])
+            return 1;
+    }
+    return 0;
+}
+
+
+/*
+ * Returns 1 when c belongs to the Extender class, 0 otherwise.
+ * http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender
+ */
+int is_extender(long c)
+{
+    /* isolated code points */
+    switch (c)
+    {
+        case 0x00B7:
+        case 0x02D0:
+        case 0x02D1:
+        case 0x0387:
+        case 0x0640:
+        case 0x0E46:
+        case 0x0EC6:
+        case 0x3005:
+            return 1;
+        default:
+            break;
+    }
+
+    /* contiguous ranges */
+    if (c >= 0x3031 && c <= 0x3035)
+        return 1;
+    if (c >= 0x309D && c <= 0x309E)
+        return 1;
+    if (c >= 0x30FC && c <= 0x30FE)
+        return 1;
+
+    return 0;
+}
+
+
+/*
+ * Returns 1 when c is one of the six ASCII whitespace characters
+ * (space, \t, \n, \v, \f, \r), 0 otherwise.
+ */
+int is_space(long c)
+{
+    switch (c)
+    {
+        case 0x0009: // Horizontal tab: \t
+        case 0x000A: // Next line: \n
+        case 0x000B: // Vertical tab \v
+        case 0x000C: // Page jump: \f
+        case 0x000D: // Carriage return: \r
+        case 0x0020: // Space
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+
+/**
+ * is_nfc:
+ * @input: UTF-8 string
+ * @length: length of string
+ *
+ * Check a string is in Unicode Normal Form C.
+ *
+ * A string made only of ASCII bytes is always reported as NFC. For any
+ * other string the answer comes from helper_nfc_check() when the build
+ * defines helper_NFC_CHECK; without it no real check is performed and
+ * the string is assumed to be NFC.
+ *
+ * Return value: Non 0 if the string is NFC
+ **/
+int is_nfc(const char *input, size_t length)
+{
+    size_t i;
+    int plain=1;
+
+    for(i=0; i<length; i++)
+        /* compare as unsigned: where plain char is signed, bytes of
+         * 0x80 and above are negative and would never exceed 0x7f */
+        if((unsigned char)input[i]>0x7f)
+        {
+            plain=0;
+            break;
+        }
+
+    if(plain)
+        return 1;
+
+    #ifdef helper_NFC_CHECK
+    return helper_nfc_check(input, length, NULL);
+    #else
+    return 1;
+    #endif
+}
+
+
+/**
+ * check:
+ * @string: UTF-8 string
+ * @length: length of string
+ *
+ * Check a string is UTF-8.
+ *
+ * The buffer is decoded one character at a time with utf8_to_char();
+ * the first sequence it rejects makes the whole string invalid. An
+ * empty buffer is valid.
+ *
+ * Return value: Non 0 if the string is UTF-8
+ **/
+int check(const char *string, size_t length)
+{
+    while(length > 0)
+    {
+        unsigned long unichar=0;
+
+        /* utf8_to_char() takes an int length and never consumes more
+         * than 6 bytes, so hand it a small window instead of narrowing
+         * a possibly huge size_t to int */
+        int avail = (length > 6) ? 6 : (int)length;
+
+        int unichar_len = utf8_to_char(&unichar, string, avail);
+        /* <= 0 also guards against a zero advance looping forever */
+        if(unichar_len <= 0 || unichar_len > avail)
+            return 0;
+
+        if(unichar > 0x10ffff)
+            return 0;
+
+        string += unichar_len;
+        length -= unichar_len;
+    }
+    return 1;
+}
+
+} } // Close namespaces
diff --git a/utf8/parser.h b/utf8/parser.h
new file mode 100644
index 0000000..19d0153
--- /dev/null
+++ b/utf8/parser.h
@@ -0,0 +1,51 @@
+/*
+ * parser.h - Raptor UTF-8 and Unicode support
+ *
+ * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
+ * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ * Copyright (C) 2009 Miriam Ruiz <little_miry at yahoo.es>
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ */
+
+#ifndef _GOPLAY_UTF8_PARSER_H
+#define _GOPLAY_UTF8_PARSER_H
+
+#include <cstdlib>
+
+// Declarations for the UTF-8 conversion and XML character-class helpers
+// implemented in parser.cpp.
+namespace utf8
+{
+ namespace parser
+ {
+
+ // Encode code point c as UTF-8 into output; returns the number of
+ // bytes written, or <0 on failure.
+ int char_to_utf8(unsigned long c, char *output);
+ // Decode one UTF-8 sequence from input (at most length bytes) into
+ // *output; returns the number of bytes used, or <0 on failure.
+ int utf8_to_char(unsigned long *output, const char *input, int length);
+
+ // XML 1.0 character classes; each returns non-0 when c is a member.
+ int is_letter(long c);
+ int is_basechar(long c);
+ int is_ideographic(long c);
+ int is_combiningchar(long c);
+ int is_digit(long c);
+ int is_extender(long c);
+
+ // Non-0 for space, \t, \n, \v, \f and \r.
+ int is_space(long c);
+
+ // Non-0 when c may start (namestartchar) or continue (namechar) an
+ // XML 1.1 / XML 1.0 Name; ':' is excluded in every case.
+ int is_xml11_namestartchar(long c);
+ int is_xml10_namestartchar(long c);
+ int is_xml11_namechar(long c);
+ int is_xml10_namechar(long c);
+
+ // Non-0 if the string is in Unicode Normal Form C.
+ int is_nfc(const char *input, size_t length);
+ // Non-0 if the first length bytes of string are UTF-8.
+ int check(const char *string, size_t length);
+
+ } // namespace utf8::parser
+} // namespace utf8
+#endif // _GOPLAY_UTF8_PARSER_H
diff --git a/utf8/testutf8.cpp b/utf8/testutf8.cpp
index 6197bf5..25f7ba0 100644
--- a/utf8/testutf8.cpp
+++ b/utf8/testutf8.cpp
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2009 Miriam Ruiz <little_miry at yahoo.es>
- * Copyright (C) 2006 Nemanja Trifunovic
+ * Copyright (C) 2009 Miriam Ruiz <little_miry at yahoo.es>
+ * Copyright (C) 2006 Nemanja Trifunovic
*
* http://utfcpp.sourceforge.net/
* http://sourceforge.net/projects/utfcpp
@@ -74,15 +74,6 @@ TEST_FUNCTION TestCuUTF8CPP(CuTest* tc)
CuAssertTrue(tc, w == twochars);
}
- // utf8::previous (deprecated)
- {
- const char *twochars = "\xe6\x97\xa5\xd1\x88";
- const char* w = twochars + 3;
- int cp = utf8::previous (w, twochars - 1);
- CuAssertTrue(tc, cp == 0x65e5);
- CuAssertTrue(tc, w == twochars);
- }
-
// utf8::advance
{
const char *twochars = "\xe6\x97\xa5\xd1\x88";
@@ -223,15 +214,6 @@ TEST_FUNCTION TestCuUTF8CPP(CuTest* tc)
CuAssertTrue(tc, w == twochars);
}
- // utf8::unchecked::previous (deprecated)
- {
- const char *twochars = "\xe6\x97\xa5\xd1\x88";
- const char *w = twochars + 3;
- int cp = utf8::unchecked::previous (w);
- CuAssertTrue(tc, cp == 0x65e5);
- CuAssertTrue(tc, w == twochars);
- }
-
// utf8::unchecked::advance
{
const char *twochars = "\xe6\x97\xa5\xd1\x88";
diff --git a/utf8/unchecked.h b/utf8/unchecked.h
index 918e6fe..882960c 100644
--- a/utf8/unchecked.h
+++ b/utf8/unchecked.h
@@ -105,13 +105,6 @@ namespace utf8
return next(temp);
}
- // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
- template <typename octet_iterator>
- inline uint32_t previous(octet_iterator& it)
- {
- return prior(it);
- }
-
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n)
{
diff --git a/utf8/utf8cpp.html b/utf8/utf8cpp.html
index 4ad7e10..a7bb0c0 100644
--- a/utf8/utf8cpp.html
+++ b/utf8/utf8cpp.html
@@ -295,7 +295,7 @@ assert (w == twochars + <span class="literal">3</span>);
This function is typically used to iterate through a UTF-8 encoded string.
</p>
<p>
- In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
+ In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
thrown.
</p>
<h4>
@@ -335,7 +335,7 @@ assert (cp == <span class="literal">0x65e5</span>);
assert (w == twochars);
</pre>
<p>
- In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
+ In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
thrown.
</p>
<h4>
@@ -345,7 +345,7 @@ assert (w == twochars);
Available in version 1.02 and later.
</p>
<p>
- Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
</p>
@@ -403,7 +403,7 @@ assert (w == twochars);
Deprecated in version 1.02 and later.
</p>
<p>
- Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
</p>
@@ -508,7 +508,7 @@ assert (w == twochars + <span class="literal">5</span>);
Available in version 1.0 and later.
</p>
<p>
- Given the iterators to two UTF-8 encoded code points in a seqence, returns the
+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the
number of code points between them.
</p>
<pre>
@@ -543,8 +543,8 @@ assert (dist == <span class="literal">2</span>);
it looked better to model it after <code>std::distance</code> algorithm.
</p>
<p>
- In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
- thrown. If <code>last</code> does not point to the past-of-end of a UTF-8 seqence,
+ In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
+ thrown. If <code>last</code> does not point to the past-of-end of a UTF-8 sequence,
a <code>utf8::not_enough_room</code> exception is thrown.
</p>
<h4>
@@ -630,8 +630,8 @@ assert (utf16result[<span class="literal">3</span>] == <span class=
"literal">0xdd1e</span>);
</pre>
<p>
- In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
- thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 seqence, a
+ In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
+ thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a
<code>utf8::not_enough_room</code> exception is thrown.
</p>
<h4>
@@ -713,8 +713,8 @@ utf8to32(twochars, twochars + <span class=
assert (utf32result.size() == <span class="literal">2</span>);
</pre>
<p>
- In case of an invalid UTF-8 seqence, a <code>utf8::invalid_utf8</code> exception is
- thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 seqence, a
+ In case of an invalid UTF-8 sequence, a <code>utf8::invalid_utf8</code> exception is
+ thrown. If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a
<code>utf8::not_enough_room</code> exception is thrown.
</p>
<h4>
@@ -789,7 +789,7 @@ assert (bvalid == false);
</pre>
<p>
<code>is_valid</code> is a shorthand for <code>find_invalid(start, end) ==
- end;</code>. You may want to use it to make sure that a byte seqence is a valid
+ end;</code>. You may want to use it to make sure that a byte sequence is a valid
UTF-8 string without the need to know where it fails if it is not valid.
</p>
<h4>
@@ -1094,7 +1094,7 @@ assert (w == twochars);
Available in version 1.02 and later.
</p>
<p>
- Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
</p>
@@ -1133,7 +1133,7 @@ assert (w == twochars);
Deprecated in version 1.02 and later.
</p>
<p>
- Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
</p>
@@ -1219,7 +1219,7 @@ assert (w == twochars + <span class="literal">5</span>);
Available in version 1.0 and later.
</p>
<p>
- Given the iterators to two UTF-8 encoded code points in a seqence, returns the
+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the
number of code points between them.
</p>
<pre>
--
Development for GoFind!
More information about the Pkg-games-commits
mailing list