[SCM] Lisaac small bindings branch, master, updated. bf8ba16dbaf185a5fd6199e56aabf23b68afb4f5

Fri Mar 27 00:07:38 UTC 2009

The following commit has been merged in the master branch:
commit bf8ba16dbaf185a5fd6199e56aabf23b68afb4f5
Author: Jeremy Cowgar <jeremy at cowgar.com>
Date:   Thu Mar 26 20:06:50 2009 -0400

    * Initial import of the pcre binding

diff --git a/pcre/pcre.li b/pcre/pcre.li
new file mode 100644
index 0000000..21594c4
--- /dev/null
+++ b/pcre/pcre.li
@@ -0,0 +1,401 @@
+//
+// This code is released as-is for any purpose you see fit. Any
+// bug fixes or enhancements you may make, it would be kind to
+// contribute these back to me so others can benefit as well.
+//
+// Jeremy Cowgar <jeremy at cowgar.com>
+//
+
+Section Header
+  // Binding for PCRE (http://pcre.org) for Lisaac
+
+  + name      := PCRE;
+
+  - copyright := "2009 Jeremy Cowgar";
+
+  - comment   := "PCRE binding. Requires linking to libpcre.";
+
+  - external := `
+  #include <pcre.h>
+  const char *PCRE_error_message;
+  int PCRE_error_offset;
+  int PCRE_error_code;
+  int PCRE_ovector[30];
+  `;
+
+Section Inherit
+
+  - parent_object:OBJECT := OBJECT;
+
+Section Public
+  // Option constants; each one mirrors the PCRE_* macro of <pcre.h>
+
+  - default         :INTEGER := 0; // no option bits set; <pcre.h> has no DEFAULT macro
+  - caseless        :INTEGER := `PCRE_CASELESS`:INTEGER;
+  - multiline       :INTEGER := `PCRE_MULTILINE`:INTEGER;
+  - dotall          :INTEGER := `PCRE_DOTALL`:INTEGER;
+  - extended        :INTEGER := `PCRE_EXTENDED`:INTEGER;
+  - anchored        :INTEGER := `PCRE_ANCHORED`:INTEGER;
+  - dollar_endonly  :INTEGER := `PCRE_DOLLAR_ENDONLY`:INTEGER;
+  - extra           :INTEGER := `PCRE_EXTRA`:INTEGER;
+  - notbol          :INTEGER := `PCRE_NOTBOL`:INTEGER;
+  - noteol          :INTEGER := `PCRE_NOTEOL`:INTEGER;
+  - ungreedy        :INTEGER := `PCRE_UNGREEDY`:INTEGER;
+  - notempty        :INTEGER := `PCRE_NOTEMPTY`:INTEGER;
+  - utf8            :INTEGER := `PCRE_UTF8`:INTEGER;
+  - no_auto_capture :INTEGER := `PCRE_NO_AUTO_CAPTURE`:INTEGER;
+  - no_utf8_check   :INTEGER := `PCRE_NO_UTF8_CHECK`:INTEGER;
+  - auto_callout    :INTEGER := `PCRE_AUTO_CALLOUT`:INTEGER;
+  - partial         :INTEGER := `PCRE_PARTIAL`:INTEGER;
+  - dfa_shortest    :INTEGER := `PCRE_DFA_SHORTEST`:INTEGER;
+  - dfa_restart     :INTEGER := `PCRE_DFA_RESTART`:INTEGER;
+  - firstline       :INTEGER := `PCRE_FIRSTLINE`:INTEGER;
+  - dupnames        :INTEGER := `PCRE_DUPNAMES`:INTEGER;
+  - newline_cr      :INTEGER := `PCRE_NEWLINE_CR`:INTEGER;
+  - newline_lf      :INTEGER := `PCRE_NEWLINE_LF`:INTEGER;
+  - newline_crlf    :INTEGER := `PCRE_NEWLINE_CRLF`:INTEGER;
+  - newline_any     :INTEGER := `PCRE_NEWLINE_ANY`:INTEGER;
+  - newline_anycrlf :INTEGER := `PCRE_NEWLINE_ANYCRLF`:INTEGER;
+  - bsr_anycrlf     :INTEGER := `PCRE_BSR_ANYCRLF`:INTEGER;
+  - bsr_unicode     :INTEGER := `PCRE_BSR_UNICODE`:INTEGER;
+
+Section Private
+
+  + regex_pointer :POINTER := NULL;
+  // Internal pointer to the "pcre *" handle.
+
+Section Public
+
+  + pattern:ABSTRACT_STRING;
+  // Regular expression pattern
+
+  + error_message:ABSTRACT_STRING;
+  // Error message, if error_code is > 0
+
+  + error_offset:INTEGER := 0;
+  // Offset in the pattern where error occurred
+
+  + error_code:INTEGER := 0;
+  // Error code
+
+Section Private
+  // Internal setters used for creation slots
+
+  - set_regex_pointer value:POINTER <- // Store `value` in the `regex_pointer` slot.
+  (
+    regex_pointer := value;
+  );
+
+  - set_pattern value:ABSTRACT_STRING <- // Store `value` in the `pattern` slot.
+  (
+    pattern := value;
+  );
+
+  - set_error_message value:ABSTRACT_STRING <- // Store `value` in the `error_message` slot.
+  (
+    error_message := value;
+  );
+
+  - set_error_offset value:INTEGER <- // Store `value` in the `error_offset` slot.
+  (
+    error_offset := value;
+  );
+
+  - set_error_code value:INTEGER <- // Store `value` in the `error_code` slot.
+  (
+    error_code := value;
+  );
+
+Section Private
+  // Internal make slots
+
+  - make pattern:ABSTRACT_STRING pointer pointer:POINTER :SELF <- // Clone used by `create` on success.
+  (
+    + result:SELF;
+
+    result := SELF.clone;
+    result.set_pattern pattern;
+    result.set_regex_pointer pointer; // handle returned by pcre_compile2
+    result
+  );
+
+  - make pattern:ABSTRACT_STRING code error_code:INTEGER offset error_offset:INTEGER message error_message:ABSTRACT_STRING :SELF <- // Clone used by `create` on failure.
+  (
+    + result:SELF;
+
+    result := SELF.clone;
+    result.set_pattern pattern;
+    result.set_error_code error_code; // `regex_pointer` is not assigned by this overload
+    result.set_error_offset error_offset;
+    result.set_error_message error_message;
+    result
+  );
+
+Section Public
+  // Creation
+
+  - create pattern:ABSTRACT_STRING :SELF <-
+  // Create a new PCRE object by compiling `pattern`.
+  //
+  // You should check the result.error_code. If not zero
+  // then an error occurred while compiling the pattern.
+  // You can retrieve detailed error information from
+  // the slots `error_code`, `error_message` and
+  // `error_offset` (1-based position in `pattern`).
+  (
+    + pointer:POINTER;
+    + n_pattern:NATIVE_ARRAY[CHARACTER];
+    + error_message:STRING;
+    + error_code:INTEGER;
+    + error_offset:INTEGER;
+    + result:SELF;
+
+    n_pattern := pattern.to_external; // C-side view of the pattern, read below as @n_pattern
+
+    pointer := `pcre_compile2(@n_pattern, 0, &PCRE_error_code, &PCRE_error_message, &PCRE_error_offset, NULL)`:POINTER;
+    (pointer = 0). if { // no handle returned: compilation failed
+      error_code := `PCRE_error_code`:INTEGER;
+      error_offset := (`PCRE_error_offset`:INTEGER + 1); // +1 turns the C offset into a 1-based position
+      error_message := STRING.clone;
+      error_message.from_external(`PCRE_error_message`:NATIVE_ARRAY[CHARACTER]);
+
+      result := SELF.make pattern code error_code offset error_offset message (error_message.to_string);
+    } else {
+      result := SELF.make pattern pointer pointer;
+    };
+
+    result
+  );
+
+  - clear <-
+  // Clear any match data held in `matches`.
+  (
+    matches.clear; // NOTE(review): `matches` has no initial value - confirm calling this before any `match` is safe
+  );
+
+Section Public
+  // Matching
+
+  + matches:FAST_ARRAY[INTEGER];
+  // Pairs of integers giving the 1-based start and end of
+  // the match for each capture group. Index 0 and 1
+  // are always the entire match. Index 2 and 3 will
+  // be the first matched group, 4 and 5 the second
+  // matched group, etc... Replaced by every call to `match`.
+
+  - match subject:ABSTRACT_STRING since since:INTEGER options options:INTEGER :INTEGER <-
+  // Match against `subject` starting at the 1-based position
+  // `since`, using the match options `options`.
+  //
+  // Returns the raw pcre_exec result; `matches` is filled when it is > 0.
+  (
+    + pointer:POINTER;
+    + n_subject:NATIVE_ARRAY[CHARACTER];
+    + subject_len:INTEGER;
+    + match_count:INTEGER;
+
+    pointer := regex_pointer; // copied into locals so the C snippet can read them as @name
+    n_subject := subject.to_external;
+    subject_len := subject.count;
+
+    match_count := `pcre_exec(@pointer, NULL, @n_subject, @subject_len, @since-1, @options, PCRE_ovector, 30)`:INTEGER;
+    (match_count > 0).if { // 30 above is the declared size of PCRE_ovector
+      matches := FAST_ARRAY[INTEGER].create (match_count * 2);
+      0.to (matches.count - 1) by 2 do { idx:INTEGER;
+        matches.put (`PCRE_ovector[@idx]`:INTEGER + 1) to idx; // start offset, shifted to 1-based
+        matches.put (`PCRE_ovector[@idx+1]`:INTEGER) to (idx+1); // end offset, stored as returned
+      };
+    } else {
+      matches := FAST_ARRAY[INTEGER].create 0; // nothing matched (or error): leave an empty array
+    };
+
+    match_count
+  );
+
+  - match subject:ABSTRACT_STRING since since:INTEGER :INTEGER <-
+  // Match against `subject` with default options (0) starting at
+  // `since`.
+  //
+  // Returns the number of matches found.
+  (
+    match subject since since options 0
+  );
+
+  - match subject:ABSTRACT_STRING options options:INTEGER :INTEGER <-
+  // Match against `subject` with the match options `options`,
+  // starting at position 1 in the `subject`.
+  //
+  // Returns the number of matches found.
+  (
+    match subject since 1 options options
+  );
+
+  - match subject:ABSTRACT_STRING :INTEGER <-
+  // Match against `subject` with default options (0) starting
+  // at position 1 in the `subject`.
+  //
+  // Returns the number of matches found.
+  (
+    match subject since 1 options 0
+  );
+
+  - count :INTEGER <-
+  // Number of match items (entire match plus groups) held in `matches`.
+  (
+    (matches.count + 1) / 2 // two entries (start, end) per item; the +1 only rounds an odd count up
+  );
+
+  - item idx:INTEGER on subject:ABSTRACT_STRING :STRING <-
+  // Get match item `idx` from `subject`, using the positions
+  // stored in `matches`: 0 is the entire match, 1 the first
+  // group, 2 the second, etc...
+  //
+  // Returns a new STRING, empty when `idx` is not below `count`.
+  (
+    + start:INTEGER;
+    + result:STRING;
+
+    (idx >= count).if { // no such item
+      result := STRING.clone;
+      result.make_empty;
+    } else {
+      start := idx * 2; // each item occupies two consecutive entries of `matches`
+
+      result := (subject.substring (matches.item start) to (matches.item (start + 1)));
+    };
+
+    result
+  );
+
+Section Private
+
+  - add_match subject:ABSTRACT_STRING match idx:INTEGER to result:STRING conversion convert:INTEGER :INTEGER <-
+  // Append match item `idx` of `subject` to `result`, applying the case conversion
+  // `convert` (0=None, 1=Next Upper, 2=Next Lower, 3=Upper, 4=Lower).
+  // Returns the conversion still in force: 1 and 2 are one-shot and become 0.
+  (
+    + value:STRING;
+    + new_convert:INTEGER;
+
+    value := item idx on subject; // bounds-checked: empty STRING when `idx` is not below `count`
+    new_convert := convert;
+
+    ( convert = 0 ).if {
+      result.append value;
+    }.elseif { convert = 1 } then {
+      (value.count > 0).if { value.put ((value.item 1).to_upper) to 1; }; // empty capture: no first character
+      result.append value;
+      new_convert := 0;
+    }.elseif { convert = 2 } then {
+      (value.count > 0).if { value.put ((value.item 1).to_lower) to 1; }; // empty capture: no first character
+      result.append value;
+      new_convert := 0;
+    }.elseif { convert = 3 } then {
+      value.to_upper;
+      result.append value;
+    }.elseif { convert = 4 } then {
+      value.to_lower;
+      result.append value;
+    };
+
+    new_convert
+  );
+
+Section Public
+  // Replacement
+  //
+  // All `replacement` values can contain a few key escaped characters:
+  //
+  // * \0 - recall entire match
+  // * \1..\9 - recall match 1 through 9
+  // * \u - convert the next character to upper case
+  // * \l - convert the next character to lower case
+  // * \U - convert all characters to upper case until a \E or \e is encountered
+  // * \L - convert all characters to lower case until a \E or \e is encountered
+  // * \E or \e - turn off upper or lower case conversion
+
+  - replace subject:ABSTRACT_STRING with replacement:ABSTRACT_STRING since since:INTEGER options options:INTEGER :STRING <-
+  // Match `subject` from position `since` with the match options
+  // `options`, then expand the escapes of `replacement` against that match.
+  //
+  // Returns a new STRING holding the expanded `replacement` text only.
+  (
+    + ch:CHARACTER;
+    + result:STRING;
+    + idx:INTEGER;
+    + convert:INTEGER; // 0=None, 1=Next Upper, 2=Next Lower, 3=Upper, 4=Lower
+
+    match subject since since options options;
+
+    result := STRING.clone;
+    idx := 1;
+    convert := 0;
+
+    { idx <= replacement.count }.while_do {
+      ch := replacement.item idx;
+
+      ( (ch = '\\') & (idx < replacement.count) ).if { // a final lone backslash has nothing to escape: kept literally
+        idx := idx + 1;
+        ch := replacement.item idx;
+
+        ( (ch >= '0') & (ch <= '9') ).if {
+          convert := add_match subject match (ch.to_integer - 48) to result conversion convert; // 48 is the code of '0'
+        }.elseif { ch = 'u' } then {
+          convert := 1;
+        }.elseif { ch = 'l' } then {
+          convert := 2;
+        }.elseif { ch = 'U' } then {
+          convert := 3;
+        }.elseif { ch = 'L' } then {
+          convert := 4;
+        }.elseif { ch.to_upper = 'E' } then {
+          convert := 0;
+        }; // any other escaped character is dropped
+      } else {
+        ( convert = 1 ).if {
+          ch := ch.to_upper;
+          convert := 0;
+        }.elseif { convert = 2 } then {
+          ch := ch.to_lower;
+          convert := 0;
+        }.elseif { convert = 3 } then {
+          ch := ch.to_upper;
+        }.elseif { convert = 4 } then {
+          ch := ch.to_lower;
+        };
+
+        result.append_character ch;
+      };
+
+      idx := idx + 1;
+    };
+
+    result
+  );
+
+  - replace subject:ABSTRACT_STRING with replacement:ABSTRACT_STRING since since:INTEGER :STRING <-
+  // Replace `subject` with `replacement` starting at `since` with
+  // the default match options (0).
+  //
+  // Returns a new STRING.
+  (
+    replace subject with replacement since since options 0
+  );
+
+  - replace subject:ABSTRACT_STRING with replacement:ABSTRACT_STRING options options:INTEGER :STRING <-
+  // Replace `subject` with `replacement` starting at position
+  // 1 with the match options `options`.
+  //
+  // Returns a new STRING.
+  (
+    replace subject with replacement since 1 options options
+  );
+
+  - replace subject:ABSTRACT_STRING with replacement:ABSTRACT_STRING :STRING <-
+  // Replace `subject` with `replacement` starting at position
+  // 1 with the default match options (0).
+  //
+  // Returns a new STRING.
+  (
+    replace subject with replacement since 1 options 0
+  );
diff --git a/pcre/test.li b/pcre/test.li
new file mode 100644
index 0000000..120f689
--- /dev/null
+++ b/pcre/test.li
@@ -0,0 +1,86 @@
+Section Header
+
+  + name := TEST;
+
+  - copyright := "2009 Jeremy Cowgar";
+
+  - comment := "Unit test for PCRE";
+
+Section Public
+
+  - main <- // Runs the PCRE unit tests: creation, basic matching, group matching, replacement.
+  (
+    + regex:PCRE;
+    + name:STRING_CONSTANT;
+    + tmp:STRING;
+
+    UNIT_TEST.suite "PCRE";
+    UNIT_TEST.section "Creation";
+
+    regex := PCRE.create "([A-Z]+"; // unbalanced parenthesis: compilation is expected to fail
+    (regex.error_code = 0).if {
+      UNIT_TEST.test_failed "PCRE.create invalid pattern";
+    } else {
+      UNIT_TEST.test_passed "PCRE.create invalid pattern";
+      UNIT_TEST.test "PCRE.error_code" integer (regex.error_code) equals 14;
+      UNIT_TEST.test "PCRE.error_offset" integer (regex.error_offset) equals 8; // 7-character pattern; PCRE.create adds 1 to the C offset
+      UNIT_TEST.test "PCRE.error_message" string (regex.error_message) equals "missing )";
+    };
+
+    regex := PCRE.create "([A-Za-z]+) ([A-Za-z]+)";
+    (regex.error_code = 0).if {
+      UNIT_TEST.test_passed "PCRE.create";
+    } else {
+      UNIT_TEST.test_failed "PCRE.create";
+    };
+
+    name := "John Doe";
+
+    UNIT_TEST.section "Basic Matching";
+    regex := PCRE.create "[a-z]+"; // lower case only, so the first match is "ohn"
+    UNIT_TEST.test "PCRE.match" integer (regex.match name) equals 1;
+    UNIT_TEST.test "PCRE.matches count" integer (regex.matches.count) equals 2;
+    UNIT_TEST.test "PCRE.matches #1" integer (regex.matches.item 0) equals 2; // positions are 1-based
+    UNIT_TEST.test "PCRE.matches #2" integer (regex.matches.item 1) equals 4;
+    UNIT_TEST.test "PCRE.item #1" string (regex.item 0 on name) equals "ohn";
+
+    UNIT_TEST.test "PCRE.match #2" integer (regex.match name since 5) equals 1; // position 5 is the space before "Doe"
+    UNIT_TEST.test "PCRE.matches count #2" integer (regex.matches.count) equals 2;
+    UNIT_TEST.test "PCRE.matches #1" integer (regex.matches.item 0) equals 7;
+    UNIT_TEST.test "PCRE.matches #2" integer (regex.matches.item 1) equals 8;
+    UNIT_TEST.test "PCRE.item #1" string (regex.item 0 on name) equals "oe";
+
+    UNIT_TEST.section "Group Matching";
+    regex := PCRE.create "([A-Za-z]+) ([A-Za-z]+)";
+    UNIT_TEST.test "PCRE.match" integer (regex.match name) equals 3; // entire match plus two groups
+    UNIT_TEST.test "PCRE.matches count" integer (regex.matches.count) equals 6;
+    UNIT_TEST.test "PCRE.matches #1" integer (regex.matches.item 0) equals 1;
+    UNIT_TEST.test "PCRE.matches #2" integer (regex.matches.item 1) equals 8;
+    UNIT_TEST.test "PCRE.matches #3" integer (regex.matches.item 2) equals 1;
+    UNIT_TEST.test "PCRE.matches #4" integer (regex.matches.item 3) equals 4;
+    UNIT_TEST.test "PCRE.matches #5" integer (regex.matches.item 4) equals 6;
+    UNIT_TEST.test "PCRE.matches #6" integer (regex.matches.item 5) equals 8;
+    UNIT_TEST.test "PCRE.item #1" string (regex.item 0 on name) equals "John Doe";
+    UNIT_TEST.test "PCRE.item #2" string (regex.item 1 on name) equals "John";
+    UNIT_TEST.test "PCRE.item #3" string (regex.item 2 on name) equals "Doe";
+
+    UNIT_TEST.section "Replacement";
+
+    regex := PCRE.create "([A-Za-z]+) ([A-Za-z]+)";
+    tmp := regex.replace "John Doe" with "\\2, \\1"; // group recall in swapped order
+    UNIT_TEST.test "PCRE.replace #1" string tmp equals "Doe, John";
+
+    tmp := regex.replace "john doe" with "\\U\\1\\e \\2"; // \U ... \e upper-cases group 1 only
+    UNIT_TEST.test "PCRE.replace #2" string tmp equals "JOHN doe";
+
+    tmp := regex.replace "john doe" with "\\u\\1 \\u\\2"; // \u upper-cases one character
+    UNIT_TEST.test "PCRE.replace #3" string tmp equals "John Doe";
+
+    tmp := regex.replace "JOHN DOE" with "\\L\\1\\e \\2"; // \L ... \e lower-cases group 1 only
+    UNIT_TEST.test "PCRE.replace #4" string tmp equals "john DOE";
+
+    tmp := regex.replace "JOHN DOE" with "\\l\\1 \\l\\2"; // \l lower-cases one character
+    UNIT_TEST.test "PCRE.replace #5" string tmp equals "jOHN dOE";
+
+    UNIT_TEST.test_results;
+  );

-- 
Lisaac small bindings