############################################################################ # # File: findre.icn # # Subject: Procedure to find regular expression # # Author: Richard L. Goerwitz # # Date: March 3, 1996 # ############################################################################ # # This file is in the public domain. # ############################################################################ # # Version: 1.17 # ############################################################################ # # DESCRIPTION: findre() is like the Icon builtin function find(), # except that it takes, as its first argument, a regular expression # pretty much like the ones the Unix egrep command uses (the few # minor differences are listed below). Its syntax is the same as # find's (i.e. findre(s1,s2,i,j)), with the exception that a no- # argument invocation wipes out all static structures utilized by # findre, and then forces a garbage collection. # ############################################################################ # # (For those not familiar with regular expressions and the Unix egrep # command: findre() offers a simple and compact wildcard-based search # system. If you do a lot of searches through text files, or write # programs which do searches based on user input, then findre is a # utility you might want to look over.) # # IMPORTANT DIFFERENCES between find and findre: As noted above, # findre() is just a find() function that takes a regular expression # as its first argument. One major problem with this setup is that # it leaves the user with no easy way to tab past a matched # substring, as with # # s ? write(tab(find("hello")+5)) # # In order to remedy this intrinsic deficiency, findre() sets the # global variable __endpoint to the first position after any given # match occurs. Use this variable with great care, preferably # assigning its value to some other variable immediately after the # match (for example, findre("hello [.?!]*",s) & tmp := __endpoint). # Otherwise, you will certainly run into trouble. (See the example # below for an illustration of how __endpoint is used). # # IMPORTANT DIFFERENCES between egrep and findre: findre utilizes # the same basic language as egrep. The only big difference is that # findre uses intrinsic Icon data structures and escaping conven- # tions rather than those of any particular Unix variant. Be care- # ful! If you put findre("\(hello\)",s) into your source file, # findre will treat it just like findre("(hello)",s). If, however, # you enter '\(hello\)' at run-time (via, say, findre(!&input,s)), # what Icon receives will depend on your operating system (most # likely, a trace will show "\\(hello\\)"). # ############################################################################ # # BUGS: Space has essentially been conserved at the expense of time # in the automata produced by findre(). The algorithm, in other # words, will produce the equivalent of a pushdown automaton under # certain circumstances, rather than strive (at the expense of space) # for full determinism. I tried to make up a nfa -> dfa converter # that would only create that portion of the dfa it needed to accept # or reject a string, but the resulting automaton was actually quite # slow (if anyone can think of a way to do this in Icon, and keep it # small and fast, please let us all know about it). Note that under # version 8 of Icon, findre takes up negligible storage space, due to # the much improved hashing algorithm. I have not tested it under # version 7, but I would expect it to use up quite a bit more space # in that environment. # # IMPORTANT NOTE: Findre takes a shortest-possible-match approach # to regular expressions. In other words, if you look for "a*", # findre will not even bother looking for an "a." It will just match # the empty string. Without this feature, findre would perform a bit # more slowly. The problem with such an approach is that often the # user will want to tab past the longest possible string of matched # characters (say tab((findre("a*|b*"), __endpoint)). In circumstan- # ces like this, please just use something like: # # s ? { # tab(find("a")) & # or use Arb() from the IPL (patterns.icn) # tab(many('a')) # tab(many('b')) # } # # or else use some combination of findre and the above. # ############################################################################ # # REGULAR EXPRESSION SYNTAX: Regular expression syntax is complex, # and yet simple. It is simple in the sense that most of its power # is concentrated in about a dozen easy-to-learn symbols. It is # complex in the sense that, by combining these symbols with # characters, you can represent very intricate patterns. # # I make no pretense here of offering a full explanation of regular # expressions, their usage, and the deeper nuances of their syntax. # As noted above, this should be gleaned from a Unix manual. For # quick reference, however, I have included a brief summary of all # the special symbols used, accompanied by an explanation of what # they mean, and, in some cases, of how they are used (most of this # is taken from the comments prepended to Jerry Nowlin's Icon-grep # command, as posted a couple of years ago): # # ^ - matches if the following pattern is at the beginning # of a line (i.e. ^# matches lines beginning with "#") # $ - matches if the preceding pattern is at the end of a line # . - matches any single character # + - matches from 1 to any number of occurrences of the # previous expression (i.e. a character, or set of paren- # thesized/bracketed characters) # * - matches from 0 to any number of occurrences of the previous # expression # \ - removes the special meaning of any special characters # recognized by this program (i.e if you want to match lines # beginning with a "[", write ^\[, and not ^[) # | - matches either the pattern before it, or the one after # it (i.e. abc|cde matches either abc or cde) # [] - matches any member of the enclosed character set, or, # if ^ is the first character, any nonmember of the # enclosed character set (i.e. [^ab] matches any character # _except_ a and b). # () - used for grouping (e.g. ^(abc|cde)$ matches lines consist- # ing of either "abc" or "cde," while ^abc|cde$ matches # lines either beginning with "abc" or ending in "cde") # ############################################################################ # # EXAMPLE program: # # procedure main(a) # while line := !&input do { # token_list := tokenize_line(line,a[1]) # every write(!token_list) # } # end # # procedure tokenize_line(s,sep) # tmp_lst := [] # s ? { # while field := tab(findre(sep)|0) & # mark := __endpoint # do { # put(tmp_lst,"" ~== field) # if pos(0) then break # else tab(mark) # } # } # return tmp_lst # end # # The above program would be compiled with findre (e.g. "icont # test_prg.icn findre.icn") to produce a single executable which # tokenizes each line of input based on a user-specified delimiter. # Note how __endpoint is set soon after findre() succeeds. Note # also how empty fields are excluded with "" ~==, etc. Finally, note # that the temporary list, tmp_lst, is not needed. It is included # here merely to illustrate one way in which tokens might be stored. # # Tokenizing is, of course, only one of many uses one might put # findre to. It is very helpful in allowing the user to construct # automata at run-time. If, say, you want to write a program that # searches text files for patterns given by the user, findre would be # a perfect utility to use. Findre in general permits more compact # expression of patterns than one can obtain using intrinsic Icon # scanning facilities. Its near complete compatibility with the Unix # regexp library, moreover, makes for greater ease of porting, # especially in cases where Icon is being used to prototype C code. # ############################################################################ global state_table, parends_present, slash_present global biggest_nonmeta_str, __endpoint record o_a_s(op,arg,state) procedure findre(re, s, i, j) local p, default_val, x, nonmeta_len, tokenized_re, tmp static FSTN_table, STRING_table initial { FSTN_table := table() STRING_table := table() } if /re then { FSTN_table := table() STRING_table := table() collect() # do it *now* return } if /s := &subject then default_val := &pos else default_val := 1 if \i then { if i < 1 then i := *s + (i+1) } else i := default_val if \j then { if j < 1 then j := *s + (j+1) } else j := *s+1 if /FSTN_table[re] then { # If we haven't seen this re before, then... if \STRING_table[re] then { # ...if it's in the STRING_table, use plain find() every p := find(STRING_table[re],s,i,j) do { __endpoint := p + *STRING_table[re]; suspend p } fail } else { # However, if it's not in the string table, we have to # tokenize it and check for metacharacters. If it has # metas, we create an FSTN, and put that into FSTN_table; # otherwise, we just put it into the STRING_table. tokenized_re := tokenize(re) if 0 > !tokenized_re then { # if at least one element is < 0, re has metas MakeFSTN(tokenized_re) | err_out(re,2) # both biggest_nonmeta_str and state_table are global /FSTN_table[re] := [.biggest_nonmeta_str, copy(state_table)] } else { # re has no metas; put the input string into STRING_table # for future reference, and execute find() at once tmp := ""; every tmp ||:= char(!tokenized_re) insert(STRING_table,re,tmp) every p := find(STRING_table[re],s,i,j) do { __endpoint := p + *STRING_table[re]; suspend p } fail } } } if nonmeta_len := (1 < *FSTN_table[re][1]) then { # If the biggest non-meta string in the original re # was more than 1, then put in a check for it... s[1:j] ? { tab(x := i to j - nonmeta_len) & (find(FSTN_table[re][1]) | fail) \ 1 & (__endpoint := apply_FSTN(&null,FSTN_table[re][2])) & (suspend x) } } else { #...otherwise it's not worth worrying about the biggest nonmeta str s[1:j] ? { tab(x := i to j) & (__endpoint := apply_FSTN(&null,FSTN_table[re][2])) & (suspend x) } } end procedure apply_FSTN(ini,tbl) local biggest_pos, POS, tmp, fin static s_tbl /ini := 1 & s_tbl := tbl & biggest_pos := 1 if ini = 0 then { return &pos } POS := &pos fin := 0 repeat { if tmp := !s_tbl[ini] & tab(tmp.op(tmp.arg)) then { if tmp.state = fin then return &pos else ini := tmp.state } else (&pos := POS, fail) } end procedure tokenize(s) local token_list, chr, tmp, b_loc, next_one, fixed_length_token_list, i token_list := list() s ? { tab(many('*+?|')) while chr := move(1) do { if chr == "\\" # it can't be a metacharacter; remove the \ and "put" # the integer value of the next chr into token_list then put(token_list,ord(move(1))) | err_out(s,2,chr) else if any('*+()|?.$^',chr) then { # Yuck! Egrep compatibility stuff. case chr of { "*" : { tab(many('*+?')) put(token_list,-ord("*")) } "+" : { tmp := tab(many('*?+')) | &null if upto('*?',\tmp) then put(token_list,-ord("*")) else put(token_list,-ord("+")) } "?" : { tmp := tab(many('*?+')) | &null if upto('*+',\tmp) then put(token_list,-ord("*")) else put(token_list,-ord("?")) } "(" : { tab(many('*+?')) put(token_list,-ord("(")) } default: { put(token_list,-ord(chr)) } } } else { case chr of { # More egrep compatibility stuff. "[" : { b_loc := find("[") | *&subject+1 every next_one := find("]",,,b_loc) \next_one ~= &pos | err_out(s,2,chr) put(token_list,-ord(chr)) } "]" : { if &pos = (\next_one+1) then put(token_list,-ord(chr)) & next_one := &null else put(token_list,ord(chr)) } default: put(token_list,ord(chr)) } } } } token_list := UnMetaBrackets(token_list) fixed_length_token_list := list(*token_list) every i := 1 to *token_list do fixed_length_token_list[i] := token_list[i] return fixed_length_token_list end procedure UnMetaBrackets(l) # Since brackets delineate a cset, it doesn't make # any sense to have metacharacters inside of them. # UnMetaBrackets makes sure there are no metacharac- # ters inside of the braces. local tmplst, i, Lb, Rb tmplst := list(); i := 0 Lb := -ord("[") Rb := -ord("]") while (i +:= 1) <= *l do { if l[i] = Lb then { put(tmplst,l[i]) until l[i +:= 1] = Rb do put(tmplst,abs(l[i])) put(tmplst,l[i]) } else put(tmplst,l[i]) } return tmplst end procedure MakeFSTN(l,INI,FIN) # MakeFSTN recursively descends through the tree structure # implied by the tokenized string, l, recording in (global) # fstn_table a list of operations to be performed, and the # initial and final states which apply to them. local i, inter, inter2, tmp, Op, Arg static Lp, Rp, Sl, Lb, Rb, Caret_inside, Dot, Dollar, Caret_outside # global biggest_nonmeta_str, slash_present, parends_present initial { Lp := -ord("("); Rp := -ord(")") Sl := -ord("|") Lb := -ord("["); Rb := -ord("]"); Caret_inside := ord("^") Dot := -ord("."); Dollar := -ord("$"); Caret_outside := -ord("^") } /INI := 1 & state_table := table() & NextState("new") & biggest_nonmeta_str := "" /FIN := 0 # I haven't bothered to test for empty lists everywhere. if *l = 0 then { /state_table[INI] := [] put(state_table[INI],o_a_s(zSucceed,&null,FIN)) return } # HUNT DOWN THE SLASH (ALTERNATION OPERATOR) every i := 1 to *l do { if l[i] = Sl & tab_bal(l,Lp,Rp) = i then { if i = 1 then err_out(l,2,char(abs(l[i]))) else { /slash_present := "yes" inter := NextState() inter2:= NextState() MakeFSTN(l[1:i],inter2,FIN) MakeFSTN(l[i+1:0],inter,FIN) /state_table[INI] := [] put(state_table[INI],o_a_s(apply_FSTN,inter2,0)) put(state_table[INI],o_a_s(apply_FSTN,inter,0)) return } } } # HUNT DOWN PARENTHESES if l[1] = Lp then { i := tab_bal(l,Lp,Rp) | err_out(l,2,"(") inter := NextState() if any('*+?',char(abs(0 > l[i+1]))) then { case l[i+1] of { -ord("*") : { /state_table[INI] := [] put(state_table[INI],o_a_s(apply_FSTN,inter,0)) MakeFSTN(l[2:i],INI,INI) MakeFSTN(l[i+2:0],inter,FIN) return } -ord("+") : { inter2 := NextState() /state_table[inter2] := [] MakeFSTN(l[2:i],INI,inter2) put(state_table[inter2],o_a_s(apply_FSTN,inter,0)) MakeFSTN(l[2:i],inter2,inter2) MakeFSTN(l[i+2:0],inter,FIN) return } -ord("?") : { /state_table[INI] := [] put(state_table[INI],o_a_s(apply_FSTN,inter,0)) MakeFSTN(l[2:i],INI,inter) MakeFSTN(l[i+2:0],inter,FIN) return } } } else { MakeFSTN(l[2:i],INI,inter) MakeFSTN(l[i+1:0],inter,FIN) return } } else { # I.E. l[1] NOT = Lp (left parenthesis as -ord("(")) every i := 1 to *l do { case l[i] of { Lp : { inter := NextState() MakeFSTN(l[1:i],INI,inter) /parends_present := "yes" MakeFSTN(l[i:0],inter,FIN) return } Rp : err_out(l,2,")") } } } # NOW, HUNT DOWN BRACKETS if l[1] = Lb then { i := tab_bal(l,Lb,Rb) | err_out(l,2,"[") inter := NextState() tmp := ""; every tmp ||:= char(l[2 to i-1]) if Caret_inside = l[2] then tmp := ~cset(Expand(tmp[2:0])) else tmp := cset(Expand(tmp)) if any('*+?',char(abs(0 > l[i+1]))) then { case l[i+1] of { -ord("*") : { /state_table[INI] := [] put(state_table[INI],o_a_s(apply_FSTN,inter,0)) put(state_table[INI],o_a_s(any,tmp,INI)) MakeFSTN(l[i+2:0],inter,FIN) return } -ord("+") : { inter2 := NextState() /state_table[INI] := [] put(state_table[INI],o_a_s(any,tmp,inter2)) /state_table[inter2] := [] put(state_table[inter2],o_a_s(apply_FSTN,inter,0)) put(state_table[inter2],o_a_s(any,tmp,inter2)) MakeFSTN(l[i+2:0],inter,FIN) return } -ord("?") : { /state_table[INI] := [] put(state_table[INI],o_a_s(apply_FSTN,inter,0)) put(state_table[INI],o_a_s(any,tmp,inter)) MakeFSTN(l[i+2:0],inter,FIN) return } } } else { /state_table[INI] := [] put(state_table[INI],o_a_s(any,tmp,inter)) MakeFSTN(l[i+1:0],inter,FIN) return } } else { # I.E. l[1] not = Lb every i := 1 to *l do { case l[i] of { Lb : { inter := NextState() MakeFSTN(l[1:i],INI,inter) MakeFSTN(l[i:0],inter,FIN) return } Rb : err_out(l,2,"]") } } } # FIND INITIAL SEQUENCES OF POSITIVE INTEGERS, CONCATENATE THEM if i := match_positive_ints(l) then { inter := NextState() tmp := Ints2String(l[1:i]) # if a slash has been encountered already, forget optimizing # in this way; if parends are present, too, then forget it, # unless we are at the beginning or end of the input string if INI = 1 | FIN = 2 | /parends_present & /slash_present & *tmp > *biggest_nonmeta_str then biggest_nonmeta_str := tmp /state_table[INI] := [] put(state_table[INI],o_a_s(match,tmp,inter)) MakeFSTN(l[i:0],inter,FIN) return } # OKAY, CLEAN UP ALL THE JUNK THAT'S LEFT i := 0 while (i +:= 1) <= *l do { case l[i] of { Dot : { Op := any; Arg := &cset } Dollar : { Op := pos; Arg := 0 } Caret_outside: { Op := pos; Arg := 1 } default : { Op := match; Arg := char(0 < l[i]) } } | err_out(l,2,char(abs(l[i]))) inter := NextState() if any('*+?',char(abs(0 > l[i+1]))) then { case l[i+1] of { -ord("*") : { /state_table[INI] := [] put(state_table[INI],o_a_s(apply_FSTN,inter,0)) put(state_table[INI],o_a_s(Op,Arg,INI)) MakeFSTN(l[i+2:0],inter,FIN) return } -ord("+") : { inter2 := NextState() /state_table[INI] := [] put(state_table[INI],o_a_s(Op,Arg,inter2)) /state_table[inter2] := [] put(state_table[inter2],o_a_s(apply_FSTN,inter,0)) put(state_table[inter2],o_a_s(Op,Arg,inter2)) MakeFSTN(l[i+2:0],inter,FIN) return } -ord("?") : { /state_table[INI] := [] put(state_table[INI],o_a_s(apply_FSTN,inter,0)) put(state_table[INI],o_a_s(Op,Arg,inter)) MakeFSTN(l[i+2:0],inter,FIN) return } } } else { /state_table[INI] := [] put(state_table[INI],o_a_s(Op,Arg,inter)) MakeFSTN(l[i+1:0],inter,FIN) return } } # WE SHOULD NOW BE DONE INSERTING EVERYTHING INTO state_table # IF WE GET TO HERE, WE'VE PARSED INCORRECTLY! err_out(l,4) end procedure NextState(new) static nextstate if \new then nextstate := 1 else nextstate +:= 1 return nextstate end procedure err_out(x,i,elem) writes(&errout,"Error number ",i," parsing ",image(x)," at ") if \elem then write(&errout,image(elem),".") else write(&errout,"(?).") exit(i) end procedure zSucceed() return .&pos end procedure Expand(s) local s2, c1, c2 s2 := "" s ? { s2 ||:= ="^" s2 ||:= ="-" while s2 ||:= tab(find("-")-1) do { if (c1 := move(1), ="-", c2 := move(1), c1 << c2) then every s2 ||:= char(ord(c1) to ord(c2)) else s2 ||:= 1(move(2), not(pos(0))) | err_out(s,2,"-") } s2 ||:= tab(0) } return s2 end procedure tab_bal(l,i1,i2) local i, i1_count, i2_count i := 0 i1_count := 0; i2_count := 0 while (i +:= 1) <= *l do { case l[i] of { i1 : i1_count +:= 1 i2 : i2_count +:= 1 } if i1_count = i2_count then suspend i } end procedure match_positive_ints(l) # Matches the longest sequence of positive integers in l, # beginning at l[1], which neither contains, nor is fol- # lowed by a negative integer. Returns the first position # after the match. Hence, given [55, 55, 55, -42, 55], # match_positive_ints will return 3. [55, -42] will cause # it to fail rather than return 1 (NOTE WELL!). local i every i := 1 to *l do { if l[i] < 0 then return (3 < i) - 1 | fail } return *l + 1 end procedure Ints2String(l) local tmp tmp := "" every tmp ||:= char(!l) return tmp end procedure StripChar(s,s2) local tmp if find(s2,s) then { tmp := "" s ? { while tmp ||:= tab(find("s2")) do tab(many(cset(s2))) tmp ||:= tab(0) } } return \tmp | s end