r8435 - software/goplay/src
Miriam Ruiz
miriam at alioth.debian.org
Tue Nov 18 16:51:16 UTC 2008
Author: miriam
Date: 2008-11-18 16:51:16 +0000 (Tue, 18 Nov 2008)
New Revision: 8435
Added:
software/goplay/src/slre.c
software/goplay/src/slre.h
software/goplay/src/utf8.c
Log:
Add new files
Added: software/goplay/src/slre.c
===================================================================
--- software/goplay/src/slre.c (rev 0)
+++ software/goplay/src/slre.c 2008-11-18 16:51:16 UTC (rev 8435)
@@ -0,0 +1,663 @@
+/*
+ * Copyright (c) 2004-2005 Sergey Lyubka <valenok at gmail.com>
+ * All rights reserved
+ *
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * Sergey Lyubka wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return.
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "slre.h"
+
+enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
+ STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT}; /* Opcode values; used as indexes into opcodes[], so the order must match the table below */
+
+static struct {
+ const char *name; /* Mnemonic printed by slre_dump() */
+ int narg; /* Number of operand bytes that follow the opcode */
+ const char *flags; /* How slre_dump() prints the operands: 'i' = integer, 'o' = code offset, 'd' = string in data[] (offset, length), 'D' = character set in data[] (offset, length) */
+} opcodes[] = {
+ {"END", 0, ""}, /* End of code block or program */
+ {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
+ {"ANY", 0, ""}, /* Match any character, "." */
+ {"EXACT", 2, "d"}, /* Match exact string */
+ {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
+ {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
+ {"OPEN ", 1, "i"}, /* Capture start, "(" */
+ {"CLOSE", 1, "i"}, /* Capture end, ")" */
+ {"BOL", 0, ""}, /* Beginning of string, "^" */
+ {"EOL", 0, ""}, /* End of string, "$" */
+ {"STAR", 1, "o"}, /* Match zero or more times "*" */
+ {"PLUS", 1, "o"}, /* Match one or more times, "+" */
+ {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
+ {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
+ {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
+ {"SPACE", 0, ""}, /* Match whitespace, "\s" */
+ {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
+ {"DIGIT", 0, ""} /* Match digit, "\d" */
+};
+
+/*
+ * Commands and operands are all unsigned char (1 byte long). All code offsets
+ * are relative to current address, and positive (always point forward). Data
+ * offsets are absolute. Commands with operands:
+ *
+ * BRANCH offset1 offset2
+ * Try to match the code block that follows the BRANCH instruction
+ * (code block ends with END). If no match, try to match code block that
+ * starts at offset1. If either of these match, jump to offset2.
+ *
+ * EXACT data_offset data_length
+ * Try to match exact string. String is recorded in data section from
+ * data_offset, and has length data_length.
+ *
+ * OPEN capture_number, CLOSE capture_number
+ * If the user has passed a 'struct cap' array for captures, OPEN
+ * records the beginning of the matched substring (cap->ptr), CLOSE
+ * sets the length (cap->len) for respective capture_number.
+ *
+ * STAR code_offset, PLUS code_offset, QUEST code_offset
+ * *, +, ?, respectively. Try to gobble as much as possible from the
+ * matched buffer, until code block that follows these instructions
+ * matches. When the longest possible string is matched,
+ * jump to code_offset
+ *
+ * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
+ */
+
+static const char *meta_chars = "|.^$*+?()[\\";
+
+/*
+ * Print the character set p[0..len-1] to fp as a comma separated list.
+ * A zero byte starts a two-byte entry: {0, 0} is printed as \x00, and
+ * {0, opcode} is printed as the opcode's mnemonic.  Other bytes are printed
+ * verbatim when printable, as \xNN otherwise.
+ */
+static void
+print_character_set(FILE *fp, const unsigned char *p, int len)
+{
+	int pos = 0;
+
+	while (pos < len) {
+		unsigned char c;
+
+		if (pos != 0)
+			(void) fputc(',', fp);
+
+		c = p[pos++];
+		if (c != 0) {
+			if (isprint(c))
+				(void) fputc(c, fp);
+			else
+				(void) fprintf(fp,"\\x%02x", c);
+			continue;
+		}
+
+		/* Two-byte entry: the second byte selects what is meant */
+		c = p[pos++];
+		if (c == 0)
+			(void) fprintf(fp, "\\x%02x", c);
+		else
+			(void) fprintf(fp, "%s", opcodes[c].name);
+	}
+}
+
+void
+slre_dump(const struct slre *r, FILE *fp)
+{ /* Write a listing of the compiled program to fp: one line per instruction, "<address> <mnemonic> <operands>" */
+ int i, j, ch, op, pc;
+
+ for (pc = 0; pc < r->code_size; pc++) { /* this pc++ steps from the last operand byte to the next opcode */
+
+ op = r->code[pc];
+ (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
+
+ for (i = 0; opcodes[op].flags[i] != '\0'; i++) /* one flag character per operand (or operand pair) */
+ switch (opcodes[op].flags[i]) {
+ case 'i': /* plain integer operand, e.g. capture number */
+ (void) fprintf(fp, "%d ", r->code[pc + 1]);
+ pc++;
+ break;
+ case 'o': /* code offset, printed as an absolute address */
+ (void) fprintf(fp, "%d ",
+ pc + r->code[pc + 1] - i); /* pc has already advanced past i operands; "- i" rebases on the opcode's own address */
+ pc++;
+ break;
+ case 'D': /* character set: operands are data offset and length */
+ print_character_set(fp, r->data +
+ r->code[pc + 1], r->code[pc + 2]);
+ pc += 2;
+ break;
+ case 'd': /* literal string: operands are data offset and length */
+ (void) fputc('"', fp);
+ for (j = 0; j < r->code[pc + 2]; j++) {
+ ch = r->data[r->code[pc + 1] + j];
+ if (isprint(ch))
+ (void) fputc(ch, fp);
+ else
+ (void) fprintf(fp,"\\x%02x",ch); /* non-printable bytes are shown in hex */
+ }
+ (void) fputc('"', fp);
+ pc += 2;
+ break;
+ }
+
+ (void) fputc('\n', fp);
+ }
+}
+
+/*
+ * Store, in the operand byte at code[pc], the distance from `offset' to the
+ * current end of the program.  Operands are one byte wide, so a distance
+ * above 0xff is reported through r->err_str and nothing is stored.
+ */
+static void
+set_jump_offset(struct slre *r, int pc, int offset)
+{
+	int distance;
+
+	assert(offset < r->code_size);
+
+	distance = r->code_size - offset;
+	if (distance <= 0xff)
+		r->code[pc] = (unsigned char) distance;
+	else
+		r->err_str = "Jump offset is too big";
+}
+
+/*
+ * Append one byte (opcode or operand) to the program.  When code[] is
+ * already full the byte is dropped and r->err_str is set.
+ */
+static void
+emit(struct slre *r, int code)
+{
+	const int capacity = (int) (sizeof(r->code) / sizeof(r->code[0]));
+
+	if (r->code_size < capacity) {
+		r->code[r->code_size] = (unsigned char) code;
+		r->code_size++;
+		return;
+	}
+
+	r->err_str = "RE is too long (code overflow)";
+}
+
+/*
+ * Append one byte to the data section.  When data[] is already full the
+ * byte is dropped and r->err_str is set.
+ */
+static void
+store_char_in_data(struct slre *r, int ch)
+{
+	const int capacity = (int) sizeof(r->data);
+
+	if (r->data_size < capacity) {
+		r->data[r->data_size] = ch;
+		r->data_size++;
+		return;
+	}
+
+	r->err_str = "RE is too long (data overflow)";
+}
+
+/*
+ * Compile a run of literal characters.  Characters are copied from *re into
+ * the data section up to the next meta character or the end of the RE, *re
+ * is left pointing at that stop character, and an EXACT instruction
+ * referring to the copied bytes is emitted.
+ */
+static void
+exact(struct slre *r, const char **re)
+{
+	const int start = r->data_size;
+	const char *p;
+
+	for (p = *re; *p != '\0' && strchr(meta_chars, *p) == NULL; p++)
+		store_char_in_data(r, *p);
+	*re = p;
+
+	emit(r, EXACT);
+	emit(r, start);
+	emit(r, r->data_size - start);
+}
+
+/*
+ * Decode the escape sequence that follows a backslash.  On entry *re points
+ * at the character after the '\'; on return it points past that character.
+ *
+ * Returns the literal byte in the low 8 bits (\n, \r, \t, \0, or the
+ * character itself for anything unrecognised), or a character-class opcode
+ * (SPACE, NONSPACE, DIGIT) shifted into bits 8-15 with the low byte zero.
+ *
+ * A backslash that is the last character of the RE stands for itself.  The
+ * terminating NUL is not consumed in that case, so callers never scan past
+ * the end of the pattern.
+ */
+static int
+get_escape_char(const char **re)
+{
+	int res;
+
+	if (**re == '\0')
+		return ('\\');
+
+	switch (*(*re)++) {
+	case 'n':	res = '\n';		break;
+	case 'r':	res = '\r';		break;
+	case 't':	res = '\t';		break;
+	case '0':	res = 0;		break;
+	case 'S':	res = NONSPACE << 8;	break;
+	case 's':	res = SPACE << 8;	break;
+	case 'd':	res = DIGIT << 8;	break;
+	default:	res = (*re)[-1];	break;
+	}
+
+	return (res);
+}
+
+/*
+ * Compile a character set.  On entry *re points just after the opening '['.
+ * Set members are copied into the data section until the closing ']', then
+ * an ANYOF (or ANYBUT, when the set starts with '^') instruction referring
+ * to them is emitted.  An escape whose low byte is zero (\0 or a character
+ * class) is stored as the pair {0, high byte}.  If the RE ends before ']'
+ * an error is recorded and nothing is emitted.
+ */
+static void
+anyof(struct slre *r, const char **re)
+{
+	const int set_start = r->data_size;
+	int opcode = ANYOF, c, esc;
+
+	if (**re == '^') {
+		opcode = ANYBUT;
+		(*re)++;
+	}
+
+	for (;;) {
+		if (**re == '\0') {
+			r->err_str = "No closing ']' bracket";
+			return;
+		}
+
+		c = *(*re)++;
+		if (c == ']')
+			break;
+
+		if (c != '\\') {
+			store_char_in_data(r, c);
+			continue;
+		}
+
+		esc = get_escape_char(re);
+		if ((esc & 0xff) != 0) {
+			store_char_in_data(r, esc);
+		} else {
+			store_char_in_data(r, 0);
+			store_char_in_data(r, esc >> 8);
+		}
+	}
+
+	emit(r, opcode);
+	emit(r, set_start);
+	emit(r, r->data_size - set_start);
+}
+
+/*
+ * Terminate the code emitted so far with END, then move everything from
+ * `begin' to the end of the program `shift' bytes forward, leaving a gap at
+ * `begin' where the caller writes an instruction that has to precede the
+ * already-compiled code (a quantifier or BRANCH).
+ *
+ * If the moved code would no longer fit into code[], an error is recorded
+ * and nothing is moved, so memmove() never writes past the array.
+ */
+static void
+relocate(struct slre *r, int begin, int shift)
+{
+	emit(r, END);
+
+	if (r->code_size + shift > (int) sizeof(r->code)) {
+		r->err_str = "RE is too long (code overflow)";
+		return;
+	}
+
+	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
+	r->code_size += shift;
+}
+
+static void
+quantifier(struct slre *r, int prev, int op)
+{ /* Wrap the instruction that starts at code[prev] in quantifier `op' (STAR, PLUS, STARQ, PLUSQ or QUEST) */
+ if (r->code[prev] == EXACT && r->code[prev + 2] > 1) { /* multi-char literal: the quantifier binds to its last character only */
+ r->code[prev + 2]--; /* shorten the original EXACT by one character */
+ emit(r, EXACT); /* and emit a separate one-character EXACT for the split-off character */
+ emit(r, r->code[prev + 1] + r->code[prev + 2]); /* data offset of that last character */
+ emit(r, 1);
+ prev = r->code_size - 3; /* the new three-byte EXACT is what gets quantified */
+ }
+ relocate(r, prev, 2); /* append END and open a two-byte gap in front of the operand */
+ r->code[prev] = op;
+ set_jump_offset(r, prev + 1, prev); /* operand: distance from the quantifier to the code that follows the loop body */
+}
+
+/*
+ * Emit an EXACT instruction matching the single byte `ch', which is
+ * appended to the data section.
+ */
+static void
+exact_one_char(struct slre *r, int ch)
+{
+	const int where = r->data_size;	/* offset the byte is about to get */
+
+	emit(r, EXACT);
+	emit(r, where);
+	emit(r, 1);
+	store_char_in_data(r, ch);
+}
+
+/*
+ * Close a pending alternative.  `fixup' is the index of the second operand
+ * of a BRANCH instruction, or 0 when no BRANCH is pending.  The current
+ * alternative is terminated with END and the operand is set to the distance
+ * from the BRANCH opcode (at fixup - 2) to the end of the program.
+ */
+static void
+fixup_branch(struct slre *r, int fixup)
+{
+	if (fixup <= 0)
+		return;
+
+	emit(r, END);
+	set_jump_offset(r, fixup, fixup - 2);
+}
+
+/*
+ * Compile the RE at *re until the end of the string or an unmatched ')'.
+ * Called recursively for every parenthesised group.  *re is advanced as the
+ * pattern is consumed; on return it points at the terminating NUL or at the
+ * ')' that closes the group being compiled.  Errors are reported through
+ * r->err_str.
+ *
+ * last_op is the code offset of the most recent quantifiable operand,
+ * branch_start the offset where the current alternation began, and fixup
+ * the index of the pending BRANCH operand (0 if none).
+ */
+static void
+compile(struct slre *r, const char **re)
+{
+	int op, esc, branch_start, last_op, fixup, cap_no, level;
+
+	fixup = 0;
+	level = r->num_caps;	/* 0 only for the outermost invocation */
+	branch_start = last_op = r->code_size;
+
+	for (;;)
+		switch (*(*re)++) {
+		case '\0':
+			(*re)--;	/* Leave *re on the terminator */
+			return;
+		case '^':
+			emit(r, BOL);
+			break;
+		case '$':
+			emit(r, EOL);
+			break;
+		case '.':
+			last_op = r->code_size;
+			emit(r, ANY);
+			break;
+		case '[':
+			/* A following quantifier must bind to the set only */
+			last_op = r->code_size;
+			anyof(r, re);
+			break;
+		case '\\':
+			last_op = r->code_size;
+			esc = get_escape_char(re);
+			if (esc & 0xff00) {
+				emit(r, esc >> 8);
+			} else {
+				exact_one_char(r, esc);
+			}
+			break;
+		case '(':
+			last_op = r->code_size;
+			cap_no = ++r->num_caps;
+			emit(r, OPEN);
+			emit(r, cap_no);
+
+			compile(r, re);
+			/* Look before stepping: *re may be on the NUL */
+			if (**re != ')') {
+				r->err_str = "No closing bracket";
+				return;
+			}
+			(*re)++;
+
+			emit(r, CLOSE);
+			emit(r, cap_no);
+			break;
+		case ')':
+			fixup_branch(r, fixup);
+			if (level == 0) {
+				r->err_str = "Unbalanced brackets";
+				/*
+				 * Nothing will ever consume this ')'.  Skip
+				 * the rest of the RE so that the caller's
+				 * loop terminates.
+				 */
+				*re += strlen(*re);
+				return;
+			}
+			(*re)--;	/* The enclosing '(' consumes it */
+			return;
+		case '+':
+		case '*':
+			op = (*re)[-1] == '*' ? STAR : PLUS;
+			if (**re == '?') {
+				(*re)++;
+				op = op == STAR ? STARQ : PLUSQ;
+			}
+			quantifier(r, last_op, op);
+			break;
+		case '?':
+			quantifier(r, last_op, QUEST);
+			break;
+		case '|':
+			fixup_branch(r, fixup);
+			relocate(r, branch_start, 3);
+			r->code[branch_start] = BRANCH;
+			set_jump_offset(r, branch_start + 1, branch_start);
+			fixup = branch_start + 2;
+			r->code[fixup] = 0xff;	/* Placeholder until fixup */
+			break;
+		default:
+			(*re)--;
+			last_op = r->code_size;
+			exact(r, re);
+			break;
+		}
+}
+
+/*
+ * Compile the regular expression `re' into `r'.
+ *
+ * The whole program is wrapped in OPEN 0 ... CLOSE 0 so that capture 0
+ * records the full match.  The RE is flagged as anchored when it starts
+ * with '^'.
+ *
+ * Returns 1 on success.  On failure returns 0 and r->err_str points to a
+ * static error message.
+ */
+int
+slre_compile(struct slre *r, const char *re)
+{
+	r->err_str = NULL;
+	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
+
+	if (*re == '^')
+		r->anchored++;
+
+	emit(r, OPEN);	/* This will capture what matches full RE */
+	emit(r, 0);
+
+	/*
+	 * Stop at the first error.  After a failure compile() may return
+	 * without having consumed its input, and `re' must not be examined
+	 * again in that case.
+	 */
+	while (r->err_str == NULL && *re != '\0')
+		compile(r, &re);
+
+	/* A top-level alternation still has its last BRANCH operand open */
+	if (r->code[2] == BRANCH)
+		fixup_branch(r, 4);
+
+	emit(r, CLOSE);
+	emit(r, 0);
+	emit(r, END);
+
+	return (r->err_str == NULL ? 1 : 0);
+}
+
+static int match(const struct slre *, int,
+ const char *, int, int *, struct cap *);
+
+/*
+ * Greedy repetition for STAR and PLUS.  `pc' is the quantifier instruction,
+ * the loop body starts at pc + 2 and the code following the loop at
+ * pc + code[pc + 1].
+ *
+ * The body is matched repeatedly.  On return *ofs is the offset after the
+ * largest number of iterations for which the following code also matched,
+ * or the entry offset if it never did.
+ */
+static void
+loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
+{
+	int saved_offset, matched_offset;
+
+	saved_offset = matched_offset = *ofs;
+
+	while (match(r, pc + 2, s, len, ofs, NULL)) {
+		/*
+		 * A body that matched without consuming input would match
+		 * again forever (e.g. "(a*)*" or a leading "*").
+		 */
+		if (*ofs == saved_offset)
+			break;
+		saved_offset = *ofs;
+		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
+			matched_offset = saved_offset;
+		*ofs = saved_offset;	/* Undo the lookahead */
+	}
+
+	*ofs = matched_offset;
+}
+
+/*
+ * Non-greedy repetition for STARQ and PLUSQ.  `pc' is the quantifier
+ * instruction, the loop body starts at pc + 2 and the code following the
+ * loop at pc + code[pc + 1].
+ *
+ * The body is matched repeatedly, stopping after the first iteration that
+ * lets the following code match.  On return *ofs is the offset after the
+ * last successful body iteration, or the entry offset if there was none.
+ */
+static void
+loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs)
+{
+	int saved_offset = *ofs;
+
+	while (match(r, pc + 2, s, len, ofs, NULL)) {
+		/* A zero-width iteration would repeat forever */
+		if (*ofs == saved_offset)
+			break;
+		saved_offset = *ofs;
+		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
+			break;
+		/*
+		 * The failed lookahead may have advanced *ofs part-way.
+		 * The next iteration has to start where this one ended.
+		 */
+		*ofs = saved_offset;
+	}
+
+	*ofs = saved_offset;
+}
+
+/*
+ * Return 1 if byte `ch' (0..255) is a member of the character set
+ * p[0..len-1] as stored by anyof(), 0 otherwise.  A set is a sequence of
+ * literal bytes in which a zero byte introduces a two-byte entry: {0, 0}
+ * is a literal NUL and {0, opcode} is the class SPACE, NONSPACE or DIGIT.
+ */
+static int
+set_has_char(const unsigned char *p, int len, int ch)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		/* Literal byte (a zero with nothing after it counts as one) */
+		if (p[i] != 0 || i + 1 >= len) {
+			if (p[i] == ch)
+				return (1);
+			continue;
+		}
+
+		switch (p[++i]) {
+		case 0:
+			if (ch == 0)
+				return (1);
+			break;
+		case SPACE:
+			if (isspace(ch))
+				return (1);
+			break;
+		case NONSPACE:
+			if (!isspace(ch))
+				return (1);
+			break;
+		case DIGIT:
+			if (isdigit(ch))
+				return (1);
+			break;
+		default:
+			break;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * ANYOF: if the byte at s[*ofs] is in the set p[0..len-1], consume it and
+ * return 1; otherwise return 0 and leave *ofs alone.  The caller guarantees
+ * that *ofs is inside the buffer.
+ */
+static int
+is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
+{
+	/* Compare as unsigned: set bytes are unsigned, plain char may not be */
+	int ch = (unsigned char) s[*ofs];
+
+	if (!set_has_char(p, len, ch))
+		return (0);
+
+	(*ofs)++;
+	return (1);
+}
+
+/*
+ * ANYBUT: if the byte at s[*ofs] is NOT in the set p[0..len-1], consume it
+ * and return 1; otherwise return 0 and leave *ofs alone.  The caller
+ * guarantees that *ofs is inside the buffer.
+ */
+static int
+is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
+{
+	int ch = (unsigned char) s[*ofs];
+
+	if (set_has_char(p, len, ch))
+		return (0);
+
+	(*ofs)++;
+	return (1);
+}
+
+static int
+match(const struct slre *r, int pc, const char *s, int len,
+ int *ofs, struct cap *caps)
+{ /* Run the code block starting at code[pc] against s[0..len-1] from offset *ofs; returns 1 on match (with *ofs advanced past the matched text), 0 otherwise. caps may be NULL. */
+ int n, saved_offset, res = 1;
+
+ while (res && r->code[pc] != END) { /* stop at the first failing instruction or at the block's END */
+
+ assert(pc < r->code_size);
+ assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
+
+ switch (r->code[pc]) {
+ case BRANCH:
+ saved_offset = *ofs;
+ res = match(r, pc + 3, s, len, ofs, caps); /* first alternative follows the BRANCH instruction */
+ if (res == 0) {
+ *ofs = saved_offset; /* rewind before trying the second alternative */
+ res = match(r, pc + r->code[pc + 1],
+ s, len, ofs, caps);
+ }
+ pc += r->code[pc + 2]; /* continue after both alternatives */
+ break;
+ case EXACT:
+ res = 0;
+ n = r->code[pc + 2]; /* String length */
+ if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
+ r->code[pc + 1], n)) {
+ (*ofs) += n;
+ res = 1;
+ }
+ pc += 3;
+ break;
+ case QUEST:
+ res = 1; /* optional: succeeds whether or not the body matches */
+ saved_offset = *ofs;
+ if (!match(r, pc + 2, s, len, ofs, caps))
+ *ofs = saved_offset;
+ pc += r->code[pc + 1];
+ break;
+ case STAR:
+ res = 1; /* zero iterations is acceptable */
+ loop_greedy(r, pc, s, len, ofs);
+ pc += r->code[pc + 1];
+ break;
+ case STARQ:
+ res = 1;
+ loop_non_greedy(r, pc, s, len, ofs);
+ pc += r->code[pc + 1];
+ break;
+ case PLUS:
+ if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0) /* the first iteration is mandatory */
+ break;
+
+ loop_greedy(r, pc, s, len, ofs);
+ pc += r->code[pc + 1];
+ break;
+ case PLUSQ:
+ if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0) /* the first iteration is mandatory */
+ break;
+
+ loop_non_greedy(r, pc, s, len, ofs);
+ pc += r->code[pc + 1];
+ break;
+ case SPACE:
+ res = 0;
+ if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
+ (*ofs)++;
+ res = 1;
+ }
+ pc++;
+ break;
+ case NONSPACE:
+ res = 0;
+ if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) {
+ (*ofs)++;
+ res = 1;
+ }
+ pc++;
+ break;
+ case DIGIT:
+ res = 0;
+ if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
+ (*ofs)++;
+ res = 1;
+ }
+ pc++;
+ break;
+ case ANY:
+ res = 0;
+ if (*ofs < len) { /* any byte matches, but there must be one left */
+ (*ofs)++;
+ res = 1;
+ }
+ pc++;
+ break;
+ case ANYOF:
+ res = 0;
+ if (*ofs < len)
+ res = is_any_of(r->data + r->code[pc + 1],
+ r->code[pc + 2], s, ofs);
+ pc += 3;
+ break;
+ case ANYBUT:
+ res = 0;
+ if (*ofs < len)
+ res = is_any_but(r->data + r->code[pc + 1],
+ r->code[pc + 2], s, ofs);
+ pc += 3;
+ break;
+ case BOL:
+ res = *ofs == 0 ? 1 : 0; /* only the very start of the buffer counts */
+ pc++;
+ break;
+ case EOL:
+ res = *ofs == len ? 1 : 0; /* only the very end of the buffer counts */
+ pc++;
+ break;
+ case OPEN:
+ if (caps != NULL)
+ caps[r->code[pc + 1]].ptr = s + *ofs; /* capture starts at the current position */
+ pc += 2;
+ break;
+ case CLOSE:
+ if (caps != NULL)
+ caps[r->code[pc + 1]].len = (s + *ofs) -
+ caps[r->code[pc + 1]].ptr; /* length = current position minus recorded start */
+ pc += 2;
+ break;
+ case END: /* not reached: the loop condition already stops on END */
+ pc++;
+ break;
+ default:
+ printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
+ assert(0);
+ break;
+ }
+ }
+
+ return (res);
+}
+
+/*
+ * Match the compiled RE `r' against buf[0..len-1].
+ *
+ * An anchored RE is tried at offset 0 only.  Otherwise every start offset
+ * from 0 up to and including `len' is tried until one matches.  Offset
+ * `len' is included so that an RE which can match the empty string, or
+ * "$" alone, also succeeds at the end of the buffer and on an empty buffer.
+ *
+ * If `caps' is not NULL it receives the captured substrings; caps[0] is the
+ * whole match.  Returns 1 on match, 0 otherwise.
+ */
+int
+slre_match(const struct slre *r, const char *buf, int len,
+	struct cap *caps)
+{
+	int	i, ofs = 0, res = 0;
+
+	if (r->anchored) {
+		res = match(r, 0, buf, len, &ofs, caps);
+	} else {
+		for (i = 0; i <= len && res == 0; i++) {
+			ofs = i;
+			res = match(r, 0, buf, len, &ofs, caps);
+		}
+	}
+
+	return (res);
+}
+
+#ifdef TEST
+/*
+ * Test driver:  <program> 'regex' <file> [count]
+ *
+ * Compiles the regex, dumps the program to stderr, matches it `count' times
+ * (default 1) against the beginning of the file and prints the result and
+ * every non-empty capture.
+ */
+int main(int argc, char *argv[])
+{
+	/* static: a 1 MB automatic array can exhaust a small stack */
+	static char	data[1 * 1024 * 1024];
+	struct slre	slre;
+	struct cap	caps[20];
+	FILE		*fp;
+	int		i, count, res, len;
+
+	if (argc < 3) {
+		printf("Usage: %s 'slre' <file> [count]\n", argv[0]);
+	} else if ((fp = fopen(argv[2], "rb")) == NULL) {
+		printf("Error: cannot open %s:%s\n", argv[2], strerror(errno));
+	} else if (!slre_compile(&slre, argv[1])) {
+		printf("Error compiling slre: %s\n", slre.err_str);
+		(void) fclose(fp);
+	} else {
+		slre_dump(&slre, stderr);
+
+		(void) memset(caps, 0, sizeof(caps));
+
+		/* Read at most the first sizeof(data) bytes (1 MB) of file */
+		len = (int) fread(data, 1, sizeof(data), fp);
+		(void) fclose(fp);
+
+		res = 0;
+		count = argc > 3 ? atoi(argv[3]) : 1;
+		for (i = 0; i < count; i++)
+			res = slre_match(&slre, data, len, caps);
+
+		printf("Result: %d\n", res);
+
+		for (i = 0; i < 20; i++)
+			if (caps[i].len > 0)
+				printf("Substring %d: [%.*s]\n", i,
+				    caps[i].len, caps[i].ptr);
+	}
+
+	return (0);
+}
+#endif /* TEST */
Added: software/goplay/src/slre.h
===================================================================
--- software/goplay/src/slre.h (rev 0)
+++ software/goplay/src/slre.h 2008-11-18 16:51:16 UTC (rev 8435)
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2004-2005 Sergey Lyubka <valenok at gmail.com>
+ * All rights reserved
+ *
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * Sergey Lyubka wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return.
+ */
+
+/*
+ * This is a regular expression library that implements a subset of Perl RE.
+ * Please refer to http://slre.sourceforge.net for detailed description.
+ *
+ * Usage example (parsing HTTP request):
+ *
+ * struct slre slre;
+ * struct cap captures[4 + 1]; // Number of bracket pairs + 1
+ * ...
+ *
+ * slre_compile(&slre,"^(GET|POST) (\\S+) HTTP/(\\S+?)\r\n");
+ *
+ * if (slre_match(&slre, buf, len, captures)) {
+ * printf("Request line length: %d\n", captures[0].len);
+ * printf("Method: %.*s\n", captures[1].len, captures[1].ptr);
+ * printf("URI: %.*s\n", captures[2].len, captures[2].ptr);
+ * }
+ *
+ * Supported syntax:
+ * ^ Match beginning of a buffer
+ * $ Match end of a buffer
+ * () Grouping and substring capturing
+ * [...] Match any character from set
+ * [^...] Match any character but ones from set
+ * \s Match whitespace
+ * \S Match non-whitespace
+ * \d Match decimal digit
+ * \r Match carriage return
+ * \n Match newline
+ * + Match one or more times (greedy)
+ * +? Match one or more times (non-greedy)
+ * * Match zero or more times (greedy)
+ * *? Match zero or more times (non-greedy)
+ * ? Match zero or once
+ * \xDD Match byte with hex value 0xDD (NOTE: slre.c in this revision has no 'x' escape, so "\x" matches a literal 'x')
+ * \meta Match one of the meta characters: ^$().[*+?|\
+ */
+
+#ifndef SLRE_HEADER_DEFINED
+#define SLRE_HEADER_DEFINED
+
+/*
+ * Compiled regular expression
+ */
+struct slre {
+ unsigned char code[256]; /* Compiled program: opcodes and their operand bytes */
+ unsigned char data[256]; /* Literal strings and character sets referenced from code[] */
+ int code_size; /* Number of bytes used in code[] */
+ int data_size; /* Number of bytes used in data[] */
+ int num_caps; /* Number of bracket pairs */
+ int anchored; /* Must match from string start */
+ const char *err_str; /* Error string */
+};
+
+/*
+ * Captured substring
+ */
+struct cap {
+ const char *ptr; /* Pointer to the substring; points into the buffer given to slre_match() */
+ int len; /* Substring length in bytes */
+};
+
+/*
+ * Compile regular expression. If success, 1 is returned.
+ * If error, 0 is returned and slre.err_str points to the error message.
+ */
+int slre_compile(struct slre *, const char *re);
+
+/*
+ * Return 1 if match, 0 if no match.
+ * If `captured_substrings' array is not NULL, then it is filled with the
+ * values of captured substrings. captured_substrings[0] element is always
+ * a full matched substring. The round bracket captures start from
+ * captured_substrings[1].
+ * It is assumed that the size of captured_substrings array is enough to
+ * hold all captures. The caller function must make sure it is! So, the
+ * array_size = number_of_round_bracket_pairs + 1
+ */
+int slre_match(const struct slre *, const char *buf, int buf_len,
+ struct cap *captured_substrings);
+
+#endif /* SLRE_HEADER_DEFINED */
Added: software/goplay/src/utf8.c
===================================================================
--- software/goplay/src/utf8.c (rev 0)
+++ software/goplay/src/utf8.c 2008-11-18 16:51:16 UTC (rev 8435)
@@ -0,0 +1,814 @@
+/* -*- Mode: c; c-basic-offset: 2 -*-
+ *
+ * helper_utf8.c - Raptor UTF-8 and Unicode support
+ *
+ * Copyright (C) 2002-2006, David Beckett http://purl.org/net/dajobe/
+ * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
+ *
+ * This package is Free Software and part of Redland http://librdf.org/
+ *
+ * It is licensed under the following three licenses as alternatives:
+ * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
+ * 2. GNU General Public License (GPL) V2 or any newer version
+ * 3. Apache License, V2.0 or any newer version
+ *
+ * You may not use this file except in compliance with at least one of
+ * the above three licenses.
+ *
+ * See LICENSE.html or LICENSE.txt at the top of this package for the
+ * complete terms and further detail along with the license texts for
+ * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
+ *
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/**
+ * helper_unicode_char_to_utf8:
+ * @c: Unicode character
+ * @output: UTF-8 string buffer or NULL
+ *
+ * Convert a Unicode character to UTF-8 encoding.
+ *
+ * Values below 0x80000000 are encoded in 1 to 6 bytes.  The output is not
+ * NUL terminated.  If @output is NULL nothing is written and only the
+ * number of bytes the encoding needs is returned.  Otherwise @output must
+ * have room for that many bytes (6 is always enough).
+ *
+ * Based on librdf_unicode_char_to_utf8().
+ *
+ * Return value: bytes encoded to output buffer or <0 on failure
+ **/
+int
+helper_unicode_char_to_utf8(unsigned long c, unsigned char *output)
+{
+  int size=0;
+
+  if      (c < 0x00000080)
+    size=1;
+  else if (c < 0x00000800)
+    size=2;
+  else if (c < 0x00010000)
+    size=3;
+  else if (c < 0x00200000)
+    size=4;
+  else if (c < 0x04000000)
+    size=5;
+  else if (c < 0x80000000)
+    size=6;
+  else
+    return -1;
+
+  /* Length query only: the documented contract allows a NULL buffer */
+  if(!output)
+    return size;
+
+  /*
+   * Emit the continuation bytes from last to first.  Each step also ORs in
+   * the length-marker bit that will end up in the lead byte once the
+   * remaining shifts are done.
+   */
+  switch(size) {
+    case 6:
+      output[5]=0x80 | (unsigned char)(c & 0x3F);
+      c= c >> 6;
+       /* set bit 2 (bits 7,6,5,4,3,2 less 7,6,5,4,3 set below) on lead byte */
+      c |= 0x4000000; /* 0x4000000 = 0x04 << 24 */
+      /* FALLTHROUGH */
+    case 5:
+      output[4]=0x80 | (unsigned char)(c & 0x3F);
+      c= c >> 6;
+       /* set bit 3 (bits 7,6,5,4,3 less 7,6,5,4 set below) on lead byte */
+      c |= 0x200000; /* 0x200000 = 0x08 << 18 */
+      /* FALLTHROUGH */
+    case 4:
+      output[3]=0x80 | (unsigned char)(c & 0x3F);
+      c= c >> 6;
+       /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on lead byte */
+      c |= 0x10000; /* 0x10000 = 0x10 << 12 */
+      /* FALLTHROUGH */
+    case 3:
+      output[2]=0x80 | (unsigned char)(c & 0x3F);
+      c= c >> 6;
+      /* set bit 5 (bits 7,6,5 less 7,6 set below) on lead byte */
+      c |= 0x800; /* 0x800 = 0x20 << 6 */
+      /* FALLTHROUGH */
+    case 2:
+      output[1]=0x80 | (unsigned char)(c & 0x3F);
+      c= c >> 6;
+      /* set bits 7,6 on lead byte */
+      c |= 0xc0;
+      /* FALLTHROUGH */
+    case 1:
+      output[0]=(unsigned char)c;
+  }
+
+  return size;
+}
+
+
+/**
+ * helper_utf8_to_unicode_char:
+ * @output: Pointer to the Unicode character or NULL
+ * @input: UTF-8 string buffer
+ * @length: buffer size
+ *
+ * Convert an UTF-8 encoded buffer to a Unicode character.
+ *
+ * If output is NULL, then will calculate the number of bytes that
+ * will be used from the input buffer and not perform the conversion.
+ *
+ * The sequence length is taken from the lead byte; the continuation bytes
+ * are not checked for the 10xxxxxx form.
+ *
+ * Return value: bytes used from input buffer or <0 on failure: -1 input buffer too short or length error, -2 overlong UTF-8 sequence, -3 illegal code positions, -4 code out of range U+0000 to U+10FFFF.  In cases -2, -3 and -4 the coded character is stored in the output.
+ */
+int
+helper_utf8_to_unicode_char(unsigned long *output,
+                            const unsigned char *input, int length)
+{
+  unsigned char in;
+  int size;
+  unsigned long c=0;
+
+  if(length < 1)
+    return -1;
+
+  /* The lead byte gives the sequence length and the top payload bits */
+  in=*input++;
+  if((in & 0x80) == 0) {
+    size=1;
+    c= in & 0x7f;
+  } else if((in & 0xe0) == 0xc0) {
+    size=2;
+    c= in & 0x1f;
+  } else if((in & 0xf0) == 0xe0) {
+    size=3;
+    c= in & 0x0f;
+  } else if((in & 0xf8) == 0xf0) {
+    size=4;
+    c = in & 0x07;
+  } else if((in & 0xfc) == 0xf8) {
+    size=5;
+    c = in & 0x03;
+  } else if((in & 0xfe) == 0xfc) {
+    size=6;
+    c = in & 0x01;
+  } else
+    return -1;
+
+
+  if(!output)
+    return size;
+
+  if(length < size)
+    return -1;
+
+  /* Each continuation byte contributes its low 6 bits */
+  switch(size) {
+    case 6:
+      in=*input++ & 0x3f;
+      c= c << 6;
+      c |= in;
+      /* FALLTHROUGH */
+    case 5:
+      in=*input++ & 0x3f;
+      c= c << 6;
+      c |= in;
+      /* FALLTHROUGH */
+    case 4:
+      in=*input++ & 0x3f;
+      c= c << 6;
+      c |= in;
+      /* FALLTHROUGH */
+    case 3:
+      in=*input++ & 0x3f;
+      c= c << 6;
+      c |= in;
+      /* FALLTHROUGH */
+    case 2:
+      in=*input++ & 0x3f;
+      c= c << 6;
+      c |= in;
+      /* FALLTHROUGH */
+    default:
+      break;
+  }
+
+  *output=c;
+
+  /*
+   * check for overlong UTF-8 sequences: the value must not fit into a
+   * shorter form (same thresholds as helper_unicode_char_to_utf8)
+   */
+  switch(size) {
+    case 2:
+      if(c < 0x00000080)
+        return -2;
+      break;
+    case 3:
+      if(c < 0x00000800)
+        return -2;
+      break;
+    case 4:
+      if(c < 0x00010000)
+        return -2;
+      break;
+    case 5:
+      if(c < 0x00200000)
+        return -2;
+      break;
+    case 6:
+      if(c < 0x04000000)
+        return -2;
+      break;
+
+    default: /* 1 */
+      break;
+  }
+
+
+  /* check for illegal code positions:
+   * U+D800 to U+DFFF (UTF-16 surrogates)
+   * U+FFFE and U+FFFF
+   */
+  if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
+    return -3;
+
+  /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
+  /* of course this makes some 4 byte forms illegal */
+  if(c > 0x10ffff)
+    return -4;
+
+  return size;
+}
+
+
+static int helper_unicode_is_letter(long c);
+static int helper_unicode_is_basechar(long c);
+static int helper_unicode_is_ideographic(long c);
+static int helper_unicode_is_combiningchar(long c);
+static int helper_unicode_is_digit(long c);
+static int helper_unicode_is_extender(long c);
+
+
+/**
+ * helper_unicode_is_xml11_namestartchar:
+ * @c: Unicode character to check
+ *
+ * Check if Unicode character is legal to start an XML 1.1 Name
+ *
+ * Namespaces in XML 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
+ * updating
+ * Extensible Markup Language (XML) 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
+ * excluding the ':'
+ *
+ * Return value: non-0 if legal
+ **/
+int
+helper_unicode_is_xml11_namestartchar(long c)
+{
+  /* Inclusive code point ranges that may start an XML 1.1 Name (no ':') */
+  static const long ranges[][2] = {
+    { 0x0041,  0x005A },  /* [A-Z] */
+    { 0x005F,  0x005F },  /* '_' */
+    { 0x0061,  0x007A },  /* [a-z] */
+    { 0x00C0,  0x00D6 },
+    { 0x00D8,  0x00F6 },
+    { 0x00F8,  0x02FF },
+    { 0x0370,  0x037D },
+    { 0x037F,  0x1FFF },
+    { 0x200C,  0x200D },
+    { 0x2070,  0x218F },
+    { 0x2C00,  0x2FEF },
+    { 0x3001,  0xD7FF },
+    { 0xF900,  0xFDCF },
+    { 0xFDF0,  0xFFFD },
+    { 0x10000, 0xEFFFF }
+  };
+  unsigned int i;
+
+  for(i=0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
+    if(c >= ranges[i][0] && c <= ranges[i][1])
+      return 1;
+  }
+
+  return 0;
+}
+
+
+/**
+ * helper_unicode_is_xml10_namestartchar:
+ * @c: Unicode character to check
+ *
+ * Check if Unicode character is legal to start an XML 1.0 Name
+ *
+ * Namespaces in XML REC 1999-01-14
+ * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
+ * updating
+ * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml-20040204/
+ * excluding the ':'
+ *
+ * Return value: non-0 if legal
+ **/
+int
+helper_unicode_is_xml10_namestartchar(long c)
+{
+  /* An XML 1.0 Name starts with an underscore or any Letter */
+  if(c == '_')
+    return 1;
+
+  return helper_unicode_is_letter(c) ? 1 : 0;
+}
+
+
+/**
+ * helper_unicode_is_namestartchar:
+ * @c: Unicode character to check
+ *
+ * Check if Unicode character is legal to start an XML Name
+ *
+ * Return value: non-0 if the character is legal
+ **/
+int
+helper_unicode_is_namestartchar(long c) {
+  int legal;
+
+  /* The XML version is selected at compile time */
+#ifdef helper_XML_1_1
+  legal = helper_unicode_is_xml11_namestartchar(c);
+#else
+  legal = helper_unicode_is_xml10_namestartchar(c);
+#endif
+
+  return legal;
+}
+
+
+/**
+ * helper_unicode_is_xml11_namechar:
+ * @c: Unicode character
+ *
+ * Check if a Unicode codepoint is a legal to continue an XML 1.1 Name
+ *
+ * Namespaces in XML 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/
+ * updating
+ * Extensible Markup Language (XML) 1.1 REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
+ * excluding the ':'
+ *
+ * Return value: non-0 if legal
+ **/
+int
+helper_unicode_is_xml11_namechar(long c)
+{
+  /* Everything that may start a Name may also continue it */
+  if(helper_unicode_is_xml11_namestartchar(c))
+    return 1;
+
+  /* Single extra code points */
+  switch(c) {
+    case 0x002D: /* '-' */
+    case 0x002E: /* '.' */
+    case 0x00B7:
+      return 1;
+    default:
+      break;
+  }
+
+  /* Extra ranges */
+  if(c >= 0x0030 && c <= 0x0039) /* 0-9 */
+    return 1;
+  if(c >= 0x0300 && c <= 0x036F)
+    return 1;
+  if(c >= 0x203F && c <= 0x2040)
+    return 1;
+
+  return 0;
+}
+
+
+/**
+ * helper_unicode_is_xml10_namechar:
+ * @c: Unicode character
+ *
+ * Check if a Unicode codepoint is a legal to continue an XML 1.0 Name
+ *
+ * Namespaces in XML REC 1999-01-14
+ * http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar
+ * updating
+ * Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
+ * http://www.w3.org/TR/2004/REC-xml-20040204/
+ * excluding the ':'
+ *
+ * Return value: non-0 if legal
+ **/
+int
+helper_unicode_is_xml10_namechar(long c)
+{
+  /* Punctuation allowed inside a name: '.', '-' and '_' */
+  if(c == 0x002E || c == 0x002D || c == 0x005F)
+    return 1;
+
+  /* Otherwise the character must belong to one of the XML 1.0 classes */
+  if(helper_unicode_is_letter(c))
+    return 1;
+  if(helper_unicode_is_digit(c))
+    return 1;
+  if(helper_unicode_is_combiningchar(c))
+    return 1;
+  if(helper_unicode_is_extender(c))
+    return 1;
+
+  return 0;
+}
+
+
+/**
+ * helper_unicode_is_namechar:
+ * @c: Unicode character to check
+ *
+ * Check if Unicode character is legal to continue an XML Name .
+ *
+ * Return value: non-0 if the character is legal
+ **/
+int
+helper_unicode_is_namechar(long c)
+{
+  int legal;
+
+  /* The XML version is selected at compile time */
+#ifdef helper_XML_1_1
+  legal = helper_unicode_is_xml11_namechar(c);
+#else
+  legal = helper_unicode_is_xml10_namechar(c);
+#endif
+
+  return legal;
+}
+
+
+/*
+ * All this below was derived by machine-transforming the classes in Appendix B
+ * of http://www.w3.org/TR/2000/REC-xml-20001006
+ */
+
+static int
+helper_unicode_is_letter(long c)
+{
+  /* Letter ::= BaseChar | Ideographic */
+  if(helper_unicode_is_basechar(c))
+    return 1;
+
+  return helper_unicode_is_ideographic(c) ? 1 : 0;
+}
+
+
+static int
+helper_unicode_is_basechar(long c)
+{
+ /* Return non-0 if c lies in one of the inclusive ranges (or equals one of the single code points) listed below; list taken from the BaseChar production, http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */
+ return((c >= 0x0041 && c <= 0x005A ) ||
+ (c >= 0x0061 && c <= 0x007A ) ||
+ (c >= 0x00C0 && c <= 0x00D6 ) ||
+ (c >= 0x00D8 && c <= 0x00F6 ) ||
+ (c >= 0x00F8 && c <= 0x00FF ) ||
+ (c >= 0x0100 && c <= 0x0131 ) ||
+ (c >= 0x0134 && c <= 0x013E ) ||
+ (c >= 0x0141 && c <= 0x0148 ) ||
+ (c >= 0x014A && c <= 0x017E ) ||
+ (c >= 0x0180 && c <= 0x01C3 ) ||
+ (c >= 0x01CD && c <= 0x01F0 ) ||
+ (c >= 0x01F4 && c <= 0x01F5 ) ||
+ (c >= 0x01FA && c <= 0x0217 ) ||
+ (c >= 0x0250 && c <= 0x02A8 ) ||
+ (c >= 0x02BB && c <= 0x02C1 ) ||
+ (c == 0x0386) ||
+ (c >= 0x0388 && c <= 0x038A ) ||
+ (c == 0x038C) ||
+ (c >= 0x038E && c <= 0x03A1 ) ||
+ (c >= 0x03A3 && c <= 0x03CE ) ||
+ (c >= 0x03D0 && c <= 0x03D6 ) ||
+ (c == 0x03DA) ||
+ (c == 0x03DC) ||
+ (c == 0x03DE) ||
+ (c == 0x03E0) ||
+ (c >= 0x03E2 && c <= 0x03F3 ) ||
+ (c >= 0x0401 && c <= 0x040C ) ||
+ (c >= 0x040E && c <= 0x044F ) ||
+ (c >= 0x0451 && c <= 0x045C ) ||
+ (c >= 0x045E && c <= 0x0481 ) ||
+ (c >= 0x0490 && c <= 0x04C4 ) ||
+ (c >= 0x04C7 && c <= 0x04C8 ) ||
+ (c >= 0x04CB && c <= 0x04CC ) ||
+ (c >= 0x04D0 && c <= 0x04EB ) ||
+ (c >= 0x04EE && c <= 0x04F5 ) ||
+ (c >= 0x04F8 && c <= 0x04F9 ) ||
+ (c >= 0x0531 && c <= 0x0556 ) ||
+ (c == 0x0559) ||
+ (c >= 0x0561 && c <= 0x0586 ) ||
+ (c >= 0x05D0 && c <= 0x05EA ) ||
+ (c >= 0x05F0 && c <= 0x05F2 ) ||
+ (c >= 0x0621 && c <= 0x063A ) ||
+ (c >= 0x0641 && c <= 0x064A ) ||
+ (c >= 0x0671 && c <= 0x06B7 ) ||
+ (c >= 0x06BA && c <= 0x06BE ) ||
+ (c >= 0x06C0 && c <= 0x06CE ) ||
+ (c >= 0x06D0 && c <= 0x06D3 ) ||
+ (c == 0x06D5) ||
+ (c >= 0x06E5 && c <= 0x06E6 ) ||
+ (c >= 0x0905 && c <= 0x0939 ) ||
+ (c == 0x093D) ||
+ (c >= 0x0958 && c <= 0x0961 ) ||
+ (c >= 0x0985 && c <= 0x098C ) ||
+ (c >= 0x098F && c <= 0x0990 ) ||
+ (c >= 0x0993 && c <= 0x09A8 ) ||
+ (c >= 0x09AA && c <= 0x09B0 ) ||
+ (c == 0x09B2) ||
+ (c >= 0x09B6 && c <= 0x09B9 ) ||
+ (c >= 0x09DC && c <= 0x09DD ) ||
+ (c >= 0x09DF && c <= 0x09E1 ) ||
+ (c >= 0x09F0 && c <= 0x09F1 ) ||
+ (c >= 0x0A05 && c <= 0x0A0A ) ||
+ (c >= 0x0A0F && c <= 0x0A10 ) ||
+ (c >= 0x0A13 && c <= 0x0A28 ) ||
+ (c >= 0x0A2A && c <= 0x0A30 ) ||
+ (c >= 0x0A32 && c <= 0x0A33 ) ||
+ (c >= 0x0A35 && c <= 0x0A36 ) ||
+ (c >= 0x0A38 && c <= 0x0A39 ) ||
+ (c >= 0x0A59 && c <= 0x0A5C ) ||
+ (c == 0x0A5E) ||
+ (c >= 0x0A72 && c <= 0x0A74 ) ||
+ (c >= 0x0A85 && c <= 0x0A8B ) ||
+ (c == 0x0A8D) ||
+ (c >= 0x0A8F && c <= 0x0A91 ) ||
+ (c >= 0x0A93 && c <= 0x0AA8 ) ||
+ (c >= 0x0AAA && c <= 0x0AB0 ) ||
+ (c >= 0x0AB2 && c <= 0x0AB3 ) ||
+ (c >= 0x0AB5 && c <= 0x0AB9 ) ||
+ (c == 0x0ABD) ||
+ (c == 0x0AE0) ||
+ (c >= 0x0B05 && c <= 0x0B0C ) ||
+ (c >= 0x0B0F && c <= 0x0B10 ) ||
+ (c >= 0x0B13 && c <= 0x0B28 ) ||
+ (c >= 0x0B2A && c <= 0x0B30 ) ||
+ (c >= 0x0B32 && c <= 0x0B33 ) ||
+ (c >= 0x0B36 && c <= 0x0B39 ) ||
+ (c == 0x0B3D) ||
+ (c >= 0x0B5C && c <= 0x0B5D ) ||
+ (c >= 0x0B5F && c <= 0x0B61 ) ||
+ (c >= 0x0B85 && c <= 0x0B8A ) ||
+ (c >= 0x0B8E && c <= 0x0B90 ) ||
+ (c >= 0x0B92 && c <= 0x0B95 ) ||
+ (c >= 0x0B99 && c <= 0x0B9A ) ||
+ (c == 0x0B9C) ||
+ (c >= 0x0B9E && c <= 0x0B9F ) ||
+ (c >= 0x0BA3 && c <= 0x0BA4 ) ||
+ (c >= 0x0BA8 && c <= 0x0BAA ) ||
+ (c >= 0x0BAE && c <= 0x0BB5 ) ||
+ (c >= 0x0BB7 && c <= 0x0BB9 ) ||
+ (c >= 0x0C05 && c <= 0x0C0C ) ||
+ (c >= 0x0C0E && c <= 0x0C10 ) ||
+ (c >= 0x0C12 && c <= 0x0C28 ) ||
+ (c >= 0x0C2A && c <= 0x0C33 ) ||
+ (c >= 0x0C35 && c <= 0x0C39 ) ||
+ (c >= 0x0C60 && c <= 0x0C61 ) ||
+ (c >= 0x0C85 && c <= 0x0C8C ) ||
+ (c >= 0x0C8E && c <= 0x0C90 ) ||
+ (c >= 0x0C92 && c <= 0x0CA8 ) ||
+ (c >= 0x0CAA && c <= 0x0CB3 ) ||
+ (c >= 0x0CB5 && c <= 0x0CB9 ) ||
+ (c == 0x0CDE) ||
+ (c >= 0x0CE0 && c <= 0x0CE1 ) ||
+ (c >= 0x0D05 && c <= 0x0D0C ) ||
+ (c >= 0x0D0E && c <= 0x0D10 ) ||
+ (c >= 0x0D12 && c <= 0x0D28 ) ||
+ (c >= 0x0D2A && c <= 0x0D39 ) ||
+ (c >= 0x0D60 && c <= 0x0D61 ) ||
+ (c >= 0x0E01 && c <= 0x0E2E ) ||
+ (c == 0x0E30) ||
+ (c >= 0x0E32 && c <= 0x0E33 ) ||
+ (c >= 0x0E40 && c <= 0x0E45 ) ||
+ (c >= 0x0E81 && c <= 0x0E82 ) ||
+ (c == 0x0E84) ||
+ (c >= 0x0E87 && c <= 0x0E88 ) ||
+ (c == 0x0E8A) ||
+ (c == 0x0E8D) ||
+ (c >= 0x0E94 && c <= 0x0E97 ) ||
+ (c >= 0x0E99 && c <= 0x0E9F ) ||
+ (c >= 0x0EA1 && c <= 0x0EA3 ) ||
+ (c == 0x0EA5) ||
+ (c == 0x0EA7) ||
+ (c >= 0x0EAA && c <= 0x0EAB ) ||
+ (c >= 0x0EAD && c <= 0x0EAE ) ||
+ (c == 0x0EB0) ||
+ (c >= 0x0EB2 && c <= 0x0EB3 ) ||
+ (c == 0x0EBD) ||
+ (c >= 0x0EC0 && c <= 0x0EC4 ) ||
+ (c >= 0x0F40 && c <= 0x0F47 ) ||
+ (c >= 0x0F49 && c <= 0x0F69 ) ||
+ (c >= 0x10A0 && c <= 0x10C5 ) ||
+ (c >= 0x10D0 && c <= 0x10F6 ) ||
+ (c == 0x1100) ||
+ (c >= 0x1102 && c <= 0x1103 ) ||
+ (c >= 0x1105 && c <= 0x1107 ) ||
+ (c == 0x1109) ||
+ (c >= 0x110B && c <= 0x110C ) ||
+ (c >= 0x110E && c <= 0x1112 ) ||
+ (c == 0x113C) ||
+ (c == 0x113E) ||
+ (c == 0x1140) ||
+ (c == 0x114C) ||
+ (c == 0x114E) ||
+ (c == 0x1150) ||
+ (c >= 0x1154 && c <= 0x1155 ) ||
+ (c == 0x1159) ||
+ (c >= 0x115F && c <= 0x1161 ) ||
+ (c == 0x1163) ||
+ (c == 0x1165) ||
+ (c == 0x1167) ||
+ (c == 0x1169) ||
+ (c >= 0x116D && c <= 0x116E ) ||
+ (c >= 0x1172 && c <= 0x1173 ) ||
+ (c == 0x1175) ||
+ (c == 0x119E) ||
+ (c == 0x11A8) ||
+ (c == 0x11AB) ||
+ (c >= 0x11AE && c <= 0x11AF ) ||
+ (c >= 0x11B7 && c <= 0x11B8 ) ||
+ (c == 0x11BA) ||
+ (c >= 0x11BC && c <= 0x11C2 ) ||
+ (c == 0x11EB) ||
+ (c == 0x11F0) ||
+ (c == 0x11F9) ||
+ (c >= 0x1E00 && c <= 0x1E9B ) ||
+ (c >= 0x1EA0 && c <= 0x1EF9 ) ||
+ (c >= 0x1F00 && c <= 0x1F15 ) ||
+ (c >= 0x1F18 && c <= 0x1F1D ) ||
+ (c >= 0x1F20 && c <= 0x1F45 ) ||
+ (c >= 0x1F48 && c <= 0x1F4D ) ||
+ (c >= 0x1F50 && c <= 0x1F57 ) ||
+ (c == 0x1F59) ||
+ (c == 0x1F5B) ||
+ (c == 0x1F5D) ||
+ (c >= 0x1F5F && c <= 0x1F7D ) ||
+ (c >= 0x1F80 && c <= 0x1FB4 ) ||
+ (c >= 0x1FB6 && c <= 0x1FBC ) ||
+ (c == 0x1FBE) ||
+ (c >= 0x1FC2 && c <= 0x1FC4 ) ||
+ (c >= 0x1FC6 && c <= 0x1FCC ) ||
+ (c >= 0x1FD0 && c <= 0x1FD3 ) ||
+ (c >= 0x1FD6 && c <= 0x1FDB ) ||
+ (c >= 0x1FE0 && c <= 0x1FEC ) ||
+ (c >= 0x1FF2 && c <= 0x1FF4 ) ||
+ (c >= 0x1FF6 && c <= 0x1FFC ) ||
+ (c == 0x2126) ||
+ (c >= 0x212A && c <= 0x212B ) ||
+ (c == 0x212E) ||
+ (c >= 0x2180 && c <= 0x2182 ) ||
+ (c >= 0x3041 && c <= 0x3094 ) ||
+ (c >= 0x30A1 && c <= 0x30FA ) ||
+ (c >= 0x3105 && c <= 0x312C ) ||
+ (c >= 0xAC00 && c <= 0xD7A3 )
+ );
+}
+
+
+static int
+helper_unicode_is_ideographic(long c)
+{
+  /*
+   * Ideographic production of XML 1.0 (Second Edition):
+   * http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic
+   *
+   * Returns 1 when c belongs to the class, 0 otherwise.
+   */
+
+  /* The single code point and the short range are tested first. */
+  if(c == 0x3007)
+    return 1;
+
+  if(c >= 0x3021 && c <= 0x3029)
+    return 1;
+
+  /* The large 0x4E00-0x9FA5 block. */
+  return (c >= 0x4E00 && c <= 0x9FA5);
+}
+
+
+static int
+helper_unicode_is_combiningchar(long c)
+{
+  /*
+   * CombiningChar production of XML 1.0 (Second Edition):
+   * http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar
+   *
+   * Each row is an inclusive {first, last} range; single code points
+   * are stored as a range of length one.  Rows follow the order of the
+   * production so the table can be compared against it line by line.
+   *
+   * Returns 1 when c belongs to the class, 0 otherwise.
+   */
+  static const long ranges[][2] = {
+    {0x0300, 0x0345},
+    {0x0360, 0x0361},
+    {0x0483, 0x0486},
+    {0x0591, 0x05A1},
+    {0x05A3, 0x05B9},
+    {0x05BB, 0x05BD},
+    {0x05BF, 0x05BF},
+    {0x05C1, 0x05C2},
+    {0x05C4, 0x05C4},
+    {0x064B, 0x0652},
+    {0x0670, 0x0670},
+    {0x06D6, 0x06DC},
+    {0x06DD, 0x06DF},
+    {0x06E0, 0x06E4},
+    {0x06E7, 0x06E8},
+    {0x06EA, 0x06ED},
+    {0x0901, 0x0903},
+    {0x093C, 0x093C},
+    {0x093E, 0x094C},
+    {0x094D, 0x094D},
+    {0x0951, 0x0954},
+    {0x0962, 0x0963},
+    {0x0981, 0x0983},
+    {0x09BC, 0x09BC},
+    {0x09BE, 0x09BE},
+    {0x09BF, 0x09BF},
+    {0x09C0, 0x09C4},
+    {0x09C7, 0x09C8},
+    {0x09CB, 0x09CD},
+    {0x09D7, 0x09D7},
+    {0x09E2, 0x09E3},
+    {0x0A02, 0x0A02},
+    {0x0A3C, 0x0A3C},
+    {0x0A3E, 0x0A3E},
+    {0x0A3F, 0x0A3F},
+    {0x0A40, 0x0A42},
+    {0x0A47, 0x0A48},
+    {0x0A4B, 0x0A4D},
+    {0x0A70, 0x0A71},
+    {0x0A81, 0x0A83},
+    {0x0ABC, 0x0ABC},
+    {0x0ABE, 0x0AC5},
+    {0x0AC7, 0x0AC9},
+    {0x0ACB, 0x0ACD},
+    {0x0B01, 0x0B03},
+    {0x0B3C, 0x0B3C},
+    {0x0B3E, 0x0B43},
+    {0x0B47, 0x0B48},
+    {0x0B4B, 0x0B4D},
+    {0x0B56, 0x0B57},
+    {0x0B82, 0x0B83},
+    {0x0BBE, 0x0BC2},
+    {0x0BC6, 0x0BC8},
+    {0x0BCA, 0x0BCD},
+    {0x0BD7, 0x0BD7},
+    {0x0C01, 0x0C03},
+    {0x0C3E, 0x0C44},
+    {0x0C46, 0x0C48},
+    {0x0C4A, 0x0C4D},
+    {0x0C55, 0x0C56},
+    {0x0C82, 0x0C83},
+    {0x0CBE, 0x0CC4},
+    {0x0CC6, 0x0CC8},
+    {0x0CCA, 0x0CCD},
+    {0x0CD5, 0x0CD6},
+    {0x0D02, 0x0D03},
+    {0x0D3E, 0x0D43},
+    {0x0D46, 0x0D48},
+    {0x0D4A, 0x0D4D},
+    {0x0D57, 0x0D57},
+    {0x0E31, 0x0E31},
+    {0x0E34, 0x0E3A},
+    {0x0E47, 0x0E4E},
+    {0x0EB1, 0x0EB1},
+    {0x0EB4, 0x0EB9},
+    {0x0EBB, 0x0EBC},
+    {0x0EC8, 0x0ECD},
+    {0x0F18, 0x0F19},
+    {0x0F35, 0x0F35},
+    {0x0F37, 0x0F37},
+    {0x0F39, 0x0F39},
+    {0x0F3E, 0x0F3E},
+    {0x0F3F, 0x0F3F},
+    {0x0F71, 0x0F84},
+    {0x0F86, 0x0F8B},
+    {0x0F90, 0x0F95},
+    {0x0F97, 0x0F97},
+    {0x0F99, 0x0FAD},
+    {0x0FB1, 0x0FB7},
+    {0x0FB9, 0x0FB9},
+    {0x20D0, 0x20DC},
+    {0x20E1, 0x20E1},
+    {0x302A, 0x302F},
+    {0x3099, 0x3099},
+    {0x309A, 0x309A}
+  };
+  size_t row;
+
+  for(row = 0; row < sizeof(ranges) / sizeof(ranges[0]); row++) {
+    if(c >= ranges[row][0] && c <= ranges[row][1])
+      return 1;
+  }
+
+  return 0;
+}
+
+
+static int
+helper_unicode_is_digit(long c)
+{
+  /*
+   * Digit production of XML 1.0 (Second Edition):
+   * http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit
+   *
+   * Inclusive {first, last} ranges in the order of the production.
+   * Note that the 0x0BE7 row starts one code point later than its
+   * neighbours, exactly as the production lists it.
+   *
+   * Returns 1 when c belongs to the class, 0 otherwise.
+   */
+  static const long ranges[][2] = {
+    {0x0030, 0x0039},
+    {0x0660, 0x0669},
+    {0x06F0, 0x06F9},
+    {0x0966, 0x096F},
+    {0x09E6, 0x09EF},
+    {0x0A66, 0x0A6F},
+    {0x0AE6, 0x0AEF},
+    {0x0B66, 0x0B6F},
+    {0x0BE7, 0x0BEF},
+    {0x0C66, 0x0C6F},
+    {0x0CE6, 0x0CEF},
+    {0x0D66, 0x0D6F},
+    {0x0E50, 0x0E59},
+    {0x0ED0, 0x0ED9},
+    {0x0F20, 0x0F29}
+  };
+  size_t row;
+
+  for(row = 0; row < sizeof(ranges) / sizeof(ranges[0]); row++) {
+    if(c >= ranges[row][0] && c <= ranges[row][1])
+      return 1;
+  }
+
+  return 0;
+}
+
+
+static int
+helper_unicode_is_extender(long c)
+{
+  /*
+   * Extender production of XML 1.0 (Second Edition):
+   * http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender
+   *
+   * Returns 1 when c belongs to the class, 0 otherwise.
+   */
+
+  /* Single code points of the production. */
+  switch(c) {
+    case 0x00B7:
+    case 0x02D0:
+    case 0x02D1:
+    case 0x0387:
+    case 0x0640:
+    case 0x0E46:
+    case 0x0EC6:
+    case 0x3005:
+      return 1;
+
+    default:
+      break;
+  }
+
+  /* Contiguous ranges of the production. */
+  if(c >= 0x3031 && c <= 0x3035)
+    return 1;
+
+  if(c >= 0x309D && c <= 0x309E)
+    return 1;
+
+  return (c >= 0x30FC && c <= 0x30FE);
+}
+
+
+/**
+ * helper_utf8_is_nfc:
+ * @input: UTF-8 string
+ * @length: length of string in bytes
+ *
+ * Check a string is in Unicode Normal Form C.
+ *
+ * A string made only of bytes <= 0x7f (including the empty string) is
+ * accepted immediately.  Otherwise the answer comes from
+ * helper_nfc_check() when helper_NFC_CHECK is defined at build time;
+ * without it the string is reported as NFC without being inspected.
+ *
+ * Return value: Non 0 if the string is NFC
+ **/
+int
+helper_utf8_is_nfc(const unsigned char *input, size_t length)
+{
+  /*
+   * The index has the same type as @length: a narrower unsigned index
+   * would wrap around before reaching a bound larger than its maximum
+   * and the scan would never terminate.
+   */
+  size_t i;
+
+  /* Stop at the first byte outside the 7-bit range. */
+  for(i = 0; i < length; i++) {
+    if(input[i] > 0x7f)
+      break;
+  }
+
+  /* Reached the end without finding one: nothing left to check. */
+  if(i == length)
+    return 1;
+
+#ifdef helper_NFC_CHECK
+  return helper_nfc_check(input, length, NULL);
+#else
+  return 1;
+#endif
+}
+
+
+/**
+ * helper_utf8_check:
+ * @string: UTF-8 string
+ * @length: length of string in bytes
+ *
+ * Check a string is UTF-8.
+ *
+ * The buffer is decoded one character at a time with
+ * helper_utf8_to_unicode_char().  The check fails as soon as a decode
+ * reports an error, claims more bytes than remain, or yields a value
+ * above 0x10ffff.  An empty buffer passes.
+ *
+ * Return value: Non 0 if the string is UTF-8
+ **/
+int
+helper_utf8_check(const unsigned char *string, size_t length)
+{
+  while(length > 0) {
+    unsigned long unichar = 0;
+
+    /*
+     * NOTE(review): helper_utf8_to_unicode_char() is defined outside
+     * this block; it is assumed to return the number of bytes consumed,
+     * or a negative value on error -- confirm against its definition.
+     */
+    int unichar_len = helper_utf8_to_unicode_char(&unichar, string, length);
+
+    /*
+     * A zero-length decode makes no progress and would loop forever,
+     * so it is rejected together with negative results.  The bound is
+     * compared as size_t: narrowing @length to int is
+     * implementation-defined for buffers larger than INT_MAX and could
+     * wrongly reject them.
+     */
+    if(unichar_len <= 0 || (size_t)unichar_len > length)
+      return 0;
+
+    /* Values above 0x10ffff are rejected. */
+    if(unichar > 0x10ffff)
+      return 0;
+
+    string += unichar_len;
+    length -= (size_t)unichar_len;
+  }
+
+  return 1;
+}
More information about the Pkg-games-commits
mailing list