ARES-SDK/ARES/application/find.lsl

514 lines
14 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/* =========================================================================
*
* Nanite Systems Advanced Research Encapsulation System
*
* Copyright (c) 20222024 Nanite Systems Corporation
*
* =========================================================================
*
* Find Application
*
* This program is covered under the terms of the ARES Software Copyright
* License, Section 2 (ASCL-ii). Although it appears in ARES as part of
* commercial software, it may be used as the basis of derivative,
* non-profit works that retain a compatible license. Derivative works of
* ASCL-ii software must retain proper attribution in documentation and
* source files as described in the terms of the ASCL. Furthermore, they
* must be distributed free of charge and provided with complete, legible
* source code included in the package.
*
* To see the full text of the ASCL, type 'help license' on any standard
* ARES distribution, or visit http://nanite-systems.com/ASCL for the
* current version.
*
* DISCLAIMER
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS
* IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED.
*
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
* DAMAGES HOWEVER CAUSED ON ANY THEORY OF LIABILITY ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
* =========================================================================
*
*/
#include <ARES/a>
#define CLIENT_VERSION "0.0.3"
#define CLIENT_VERSION_TAGS "prealpha"
list argsplit(string m) {
// splits a string based on word boundaries, but groups terms inside double and single quotes
// tabs and newlines are converted to spaces
list results;
string TAB = llChar(9);
list atoms = llParseStringKeepNulls(m, [], [" ", "\\", "'", "\"", "\n", TAB]);
integer in_quotes; // 1 = single, 2 = double
integer ti = 0;
integer tmax = count(atoms);
string buffer;
while(ti < tmax) {
string t = gets(atoms, ti);
integer c = llOrd(t, 0);
if(c == 0x22) { // '"'
if(in_quotes == 0) {
in_quotes = 2;
} else if(in_quotes == 2) {
in_quotes = 0;
} else {
buffer += t;
}
} else if(c == 0x27) { // '\''
if(in_quotes == 0) {
in_quotes = 1;
} else if(in_quotes == 1) {
in_quotes = 0;
} else {
buffer += t;
}
} else if(c == 0x5c) { // '\\'
string t1 = gets(atoms, ti + 2);
if(t1 == "\"" || t1 == "'") {
buffer += t1;
ti += 2;
} else {
buffer += t;
}
} else if(c == 0x20 || c == 0x0a || c == 0x09) { // ' ', '\n', '\t'
if(in_quotes) {
buffer += " ";
} else {
results += buffer;
buffer = "";
}
} else {
buffer += t;
}
++ti;
}
if(tmax)
results += buffer;
return results;
}
// this is O(n^2), but the outer loop is usually sparse
// so, it might be faster than reversing the list and using a single normal search
/*
// no longer required; LL added llListFindListNext(haystack, needle, index) which can take -1 as index
integer nsListFindLast(list a, list b) {
if(a == [] || b == []) // either list is empty; bail
return NOWHERE;
integer i = llListFindList(a, b);
if(!~i) // doesn't exist; bail
return NOWHERE;
integer cb = count(b);
integer ca = count(a);
if(i == ca - cb) // first match is already at the end
return i;
if(!llListFindList(sublist(a, -cb, LAST), b)) // do we have a match at the end?
return ca - cb;
integer j = i;
list s;
while(~(i = llListFindList(s = llDeleteSubList(a, 0, j), b)))
j += i + 1;
return j;
} */
#define reverse_index(lis, item) llListFindListNext(lis, item, LAST)
integer first_is_skippable;
integer case_insensitive;
integer stop_after_first_match;
list states; // patterns matched at each offset
list exits; // next state of each state - stored as integer unless there are multiple hits
add_exit(integer where, integer to) {
if(!~where) {
#ifdef DEBUG
echo("can't patch -1 to " + (string)to);
#endif
return;
} else if(llGetListEntryType(exits, where) == TYPE_INTEGER) {
integer w = geti(exits, where);
if(~w) {
exits = alter(exits, [(string)to + "," + (string)w], where, where);
} else {
exits = alter(exits, [to], where, where);
}
} else {
exits = alter(exits, [(string)to + "," + gets(exits, where)], where, where);
}
}
construct_regex(string pattern) {
list paren_stack; // state indices corresponding to "(" symbols
list pipe_stack; // state indices corresponding to "|" symbols
first_is_skippable = 0; // set to TRUE if state 0 has ? or *
integer ci = 0;
integer cmax = strlen(pattern);
integer cc_open = NOWHERE; // character class open - position of '[' character
integer sc = 0; // number of states we've emitted so far
string buffer; // for character classes only
while(ci < cmax) {
string c = substr(pattern, ci, ci);
string d = substr(pattern, ci, ci + 1);
if(~cc_open) {
if(d == "\\-" || d == "\\\\") {
buffer += d;
ci += 1;
} else if(d == "\\^" /*|| d == "\\$" || d == "\\[" || d == "\\]" || d == "\\(" || d == "\\)" || d == "\\*" || d == "\\?" || d == "\\." || d == "\\\\"*/ || d == "\\]") {
buffer += substr(d, 1, 1);
ci += 1;
} else if(c == "]") {
string new_buffer = "\n" + substr(buffer, 0, 0);
integer bi = 1;
integer bmax = strlen(buffer);
integer found_hyphen = 0;
string last_e;
while(bi < bmax) {
string e = substr(buffer, bi, bi);
string f = substr(buffer, bi, bi + 1);
if(f == "\\$" || f == "\\[" || f == "\\(" || f == "\\)" || f == "\\*" || f == "\\?" || f == "\\." || f == "\\\\" || f == "\\-") {
e = substr(f, 1, 1);
++bi;
} else if(e == "-") {
found_hyphen = 2;
}
if(found_hyphen == 2) {
found_hyphen = 1;
} else if(found_hyphen == 1) {
integer ck = llOrd(last_e, 0) + 1;
integer end_char = llOrd(e, 0);
if(ck < end_char) {
while(ck < end_char)
new_buffer += llChar(ck++);
} else if(ck - 2 > end_char) {
ck -= 2;
while(ck > end_char)
new_buffer += llChar(ck--);
}
found_hyphen = 0;
new_buffer += e;
last_e = e;
} else {
new_buffer += e;
last_e = e;
}
++bi;
}
states += new_buffer;
exits += [NOWHERE];
add_exit(sc - 1, sc);
buffer = "";
++sc;
cc_open = NOWHERE;
} else {
buffer += c;
}
} else if(d == "[^" && !~cc_open) {
cc_open = ci;
buffer = "^";
ci += 1;
} else if(c == "[" && !~cc_open) {
cc_open = ci;
buffer = "[";
} else if(d == "\\^" || d == "\\$" || d == "\\[" || d == "\\]" || d == "\\(" || d == "\\)" || d == "\\*" || d == "\\?" || d == "\\." || d == "\\\\") {
states += d;
exits += [NOWHERE];
add_exit(sc - 1, sc);
++sc;
ci += 1;
} else if(c == "*") {
if(sc == 1)
first_is_skippable = TRUE;
else
add_exit(sc - 2, sc);
add_exit(sc - 1, sc - 1);
} else if(c == "+") {
add_exit(sc - 1, sc - 1);
} else if(c == "?") {
if(sc == 1)
first_is_skippable = TRUE;
else
add_exit(sc - 2, sc);
} else if(c == "(") {
paren_stack += [sc];
states += "";
exits += [NOWHERE];
add_exit(sc - 1, sc);
++sc;
} else if(d == ")?") {
// todo
ci += 1;
} else if(d == ")*") {
// todo
ci += 1;
} else if(c == ")") {
// todo
integer last_open = index(paren_stack, ")");
// find last "("
// find all subsequent "|"
// patch "(" to also point to "|"s
// patch symbols before "|" to also point to sc
// patch symbol before ")" to point to sc
} else if(c == "|") {
pipe_stack += [sc];
states += "";
exits += [NOWHERE];
add_exit(sc - 1, sc);
++sc;
} else if(d == "\\*" || d == "\\.") {
states += d;
//exits = alter(exits, [sc], sc - 1, sc - 1) + [NOWHERE];
add_exit(sc - 1, sc);
exits += [NOWHERE];
++sc;
} else {
states += c;
// exits = alter(exits, [sc], sc - 1, sc - 1) + [NOWHERE];
add_exit(sc - 1, sc);
exits += [NOWHERE];
++sc;
}
++ci;
}
// fix all jumps to nowhere
integer si = count(states);
while(si--) {
list next = split(gets(exits, si), ",");
integer affect = 0;
integer sin;
while(~(sin = index(next, (string)sc))) {
next = alter(next, ["-1"], sin, sin);
affect = 1;
}
if(affect) {
if(count(next) > 1)
exits = alter(exits, [concat(next, ",")], si, si);
else
exits = alter(exits, [(integer)gets(next, 0)], si, si);
#ifdef DEBUG
echo("fixed jump to nowhere in state " + (string)si);
#endif
}
}
list last_exits = split(gets(exits, LAST), ",");
integer nli = index(last_exits, (string)NOWHERE);
if(!~nli) {
exits = alter(exits, [concat(last_exits, ",") + ",-1"], LAST, LAST);
#ifdef DEBUG
echo("patched last exit");
#endif
}
}
integer match(string q, string t) {
if(q == ".")
return (t != "");
if(q == "\\^" || q == "\\$" || q == "\\[" || q == "\\]" || q == "\\(" || q == "\\)" || q == "\\*" || q == "\\?" || q == "\\." || q == "\\\\")
return (llOrd(t, 0) == llOrd(q, 1));
if(case_insensitive) {
q = llToLower(q);
t = llToLower(t);
}
if(llOrd(q, 0) == 0x0a) { // character ranges become '\n[' if positive and '\n^' if negative
if(llOrd(q, 1) == 0x5e) // '^'
return !~strpos(substr(q, 2, LAST), t);
else // implies '['
return TRUE && ~strpos(substr(q, 2, LAST), t);
}
return (q == t);
}
list find(string text) {
integer ti_start = NOWHERE;
integer ti;
integer tmax = strlen(text);
integer si;
string result;
list path;
integer smax = count(states);
integer last_exit = NOWHERE;
while(~si && ti < tmax) {
list next = split(gets(exits, si), ",");
string st = gets(states, si);
string tt = substr(text, ti, ti);
if(match(st, tt)) {
if(!count(path))
ti_start = ti;
result += tt;
#ifdef DEBUG
echo("matched \"" + (string)result + "\" ('" + tt + "' at char " + (string)ti + " vs. '" + st + "' from state " + (string)si + ")");
#endif
path += si;
si = geti(next, 0);
// todo: for backreferences this must be changed to the appropriate length
if(st != "")
++ti; // null symbols are legal but don't advance the character counter
if(!~si) {
return [ti_start, result];
}
} else if(si != last_exit) {
if(!count(path)) {
ti_start = ti = ti + 1;
si = 0;
next = split(gets(exits, 0), ",");
#ifdef DEBUG
echo("could not start matching with '" + tt + "' at char " + (string)ti);
#endif
} else {
integer prev = geti(path, LAST);
next = split(gets(exits, prev), ",");
#ifdef DEBUG
echo("Available 'next' options: " + concat(next, ","));
#endif
integer sj = index(next, (string)si);
next = delrange(next, 0, sj);
#ifdef DEBUG
echo("did not match ('" + tt + "' at char " + (string)ti + " vs. '" + st + "' from state " + (string)si + "); sj was " + (string)sj);
#endif
si = geti(next, 0);
}
} else {
integer dpi = count(path);
/* path = delitem(path, LAST);
result = delstring(result, LAST, LAST); */
integer dsi = si;
while(dpi--) {
integer dprev = geti(path, dpi);
list dnext = split(gets(exits, dprev), ",");
integer dlast_exit = geti(dnext, LAST);
if(dlast_exit != dsi) {
integer dsj = index(dnext, (string)dsi);
si = geti(dnext, dsj + 1);
#ifdef DEBUG
echo("backtracked to state " + (string)dprev + "; trying " + (string)si + "; dsj was " + (string)dsj);
#endif
jump got_it; // backtrack was successful
} else {
dsi = dprev;
string dst = gets(states, geti(path, LAST));
path = delitem(path, LAST);
// todo: for backreferences this must be changed to the appropriate length
if(dst != "")
result = delstring(result, LAST, LAST);
#ifdef DEBUG
echo("backtracked past " + (string)dprev + " = '" + gets(states, dprev) + "'");
#endif
}
}
// match failed - step forward
if(~ti_start) {
ti_start = ti = ti_start + 1;
} else {
ti += 1;
}
si = 0;
@got_it;
}
last_exit = geti(next, LAST);
}
return [NOWHERE, ""];
}
main(integer src, integer n, string m, key outs, key ins, key user) {
if(n == SIGNAL_INVOKE) {
list argv = argsplit(m);
integer argc = count(argv);
string msg = "";
case_insensitive = 0;
stop_after_first_match = 0;
list options = ["-i", "-1"];
integer opt;
while(~(opt = index(options, gets(argv, 1)))) {
argv = delitem(argv, 1);
--argc;
if(opt == 0) {
case_insensitive = 1;
} else if(opt == 1) {
stop_after_first_match = 1;
}
}
if(argc == 1) {
msg = "Syntax: " + PROGRAM_NAME + " [-i -1] <pattern> [<filename> ...]";
} else {
states = exits = [];
string pattern = gets(argv, 1);
argv = delitem(argv, 1);
llResetTime();
construct_regex(pattern);
// echo("Compiled regular expression in " + (string)llGetTime() + " sec"); // usually returns zero!
echo("States: " + concat(states, "•"));
echo("Exits: " + concat(exits, "•"));
string query_string;
if(ins == NULL_KEY) {
pipe_read(ins, query_string);
}
if(query_string == "") {
query_string = concat(delitem(argv, 0), " ");
}
msg = concat(find(query_string), ": ");
}
if(msg != "")
print(outs, user, msg);
} else if(n == SIGNAL_INIT) {
#ifdef DEBUG
echo("[" + PROGRAM_NAME + "] init event");
#endif
} else if(n == SIGNAL_UNKNOWN_SCRIPT) {
echo("[" + PROGRAM_NAME + "] failed to run '" + m + "' (kernel could not find the program specified)");
} else {
echo("[" + PROGRAM_NAME + "] unimplemented signal " + (string)n + ": " + m);
}
}
#include <ARES/program>