############################################################################ # # File: regexp.icn # # Subject: Procedure for regular-expression pattern matching # # Author: Robert J. Alexander # # Date: May 19, 1996 # ############################################################################ # # This file is in the public domain. # ############################################################################ # # This is a kit of procedures to deal with UNIX-like regular expression # patterns. # # These procedures are interesting partly because of the "recursive # suspension" (or "suspensive recursion" :-) technique used to simulate # conjunction of an arbitrary number of computed expressions (see # notes, below). # # The public procedures are: # # ReMatch(pattern,s,i1,i2) : i3,i4,...,iN # ReFind(pattern,s,i1,i2) : i3,i4,...,iN # RePat(s) : pattern list # ############################################################################ # # ReMatch() produces the sequence of positions in "s" past a substring # starting at "i1" that matches "pattern", but fails if there is no # such position. Similar to match(), but is capable of generating # multiple positions. # # ReFind() produces the sequence of positions in "s" where substrings # begin that match "pattern", but fails if there is no such position. # Similar to find(). Each position is produced only once, even if # several possible matches are possible at that position. # # "pattern" can be either a string or a pattern list -- see RePat(), # below. # # Default values of s, i1, and i2 are handled as for Icon's built-in # string scanning procedures such as match(). # ############################################################################ # # RePat(s) : L # # Creates a pattern element list from pattern string "s", but fails if # the pattern string is not syntactically correct. ReMatch() and # ReFind() will automatically convert a pattern string to a pattern # list, but it is faster to do the conversion explicitly if multiple # operations are done using the same pattern. An additional advantage # to compiling the pattern separately is avoiding ambiguity of failure # caused by an incorrect pattern and failure to match a correct pattern. # ############################################################################ # # ReCaseIndependent() : n # ReCaseDependent() : n # # Set mode for case-independent or case-dependent matching. The initial # mode is case-dependent. # ############################################################################ # # Accessible Global Variables # # After a match, the strings matched by parenthesized regular # expressions are left in list "Re_ParenGroups", and can be accessed by # subscripting in using the same number as the \N construct. # # If it is desired that regular expression format be similar to UNIX # filename generation patterns but still retain the power of full # regular expressions, make the following assignments prior to # compiling the pattern string: # # Re_ArbString := "*" # Defaults to ".*" # # The sets of characters (csets) that define a word, digits, and white # space can be modified. The following assignments can be made before # compiling the pattern string. The character sets are captured when # the pattern is compiled, so changing them after pattern compilation # will not alter the behavior of matches unless the pattern string is # recompiled. # # Re_WordChars := 'whatever you like' # # Defaults to &letters ++ &digits ++ "_" # Re_Digits := &digits ++ 'ABCDEFabcdef' # # Defaults to &digits # Re_Space := 'whatever you like' # # Defaults to ' \t\v\n\r\f' # # These globals are normally not initialized until the first call to # RePat(), and then only if they are null. They can be explicitly # initialized to their defaults (if they are null) by calling # Re_Default(). # ############################################################################ # # Characters compiled into patterns can be passed through a # user-supplied filter procedure, provided in global variable # Re_Filter. The filtering is done before the characters are bound # into the pattern. The filter proc is passed one argument, the string # to filter, and it must return the filtered string as its result. If # the filter proc fails, the string will be used unfiltered. The # filter proc is called with an argument of either type string (for # characters in the pattern) or cset (for character classes [...]). # # Filtering is done only as the pattern is compiled. Any filtering of # strings to be matched must be explicitly done. # ############################################################################ # # By default, individual pattern elements are matched in a "leftmost- # longest-first" sequence, which is the order observed by perl, egrep, # and most other regular expression matchers. If the order of matching # is not important a performance improvement might be seen if pattern # elements are matched in "shortest-first" order. The following global # variable setting causes the matcher to operate in leftmost-shortest- # first order. # # Re_LeftmostShortest := 1 # ############################################################################ # # In the case of patterns containing alternation, ReFind() will # generally not produce positions in increasing order, but will produce # all positions from the first term of the alternation (in increasing # order) followed by all positions from the second (in increasing # order). If it is necessary that the positions be generated in # strictly increasing order, with no duplicates, assign any non-null # value to Re_Ordered: # # Re_Ordered := 1 # # If the Re_Ordered option is chosen, there is a *small* penalty in # efficiency in some cases, and the co-expression facility is required # in your Icon implementation. # ############################################################################ # # Regular Expression Characters and Features Supported # # The regular expression format supported by procedures in this file # model very closely those supported by the UNIX "egrep" program, with # modifications as described in the Perl programming language # definition. Following is a brief description of the special # characters used in regular expressions. In the description, the # abbreviation RE means regular expression. # # c An ordinary character (not one of the special characters # discussed below) is a one-character RE that matches that # character. # # \c A backslash followed by any special character is a one- # character RE that matches the special character itself. # # Note that backslash escape sequences representing # non-graphic characters are not supported directly # by these procedures. Of course, strings coded in an # Icon program will have such escapes handled by the # Icon translator. If such escapes must be supported # in strings read from the run-time environment (e.g. # files), they will have to be converted by other means, # such as the Icon Program Library procedure "escape()". # # . A period is a one-character RE that matches any # character. # # [string] A non-empty string enclosed in square brackets is a one- # character RE that matches any *one* character of that # string. If, the first character is "^" (circumflex), # the RE matches any character not in the remaining # characters of the string. The "-" (minus), when between # two other characters, may be used to indicate a range of # consecutive ASCII characters (e.g. [0-9] is equivalent to # [0123456789]). Other special characters stand for # themselves in a bracketed string. # # * Matches zero or more occurrences of the RE to its left. # # + Matches one or more occurrences of the RE to its left. # # ? Matches zero or one occurrences of the RE to its left. # # {N} Matches exactly N occurrences of the RE to its left. # # {N,} Matches at least N occurrences of the RE to its left. # # {N,M} Matches at least N occurrences but at most M occurrences # of the RE to its left. # # ^ A caret at the beginning of an entire RE constrains # that RE to match an initial substring of the subject # string. # # $ A currency symbol at the end of an entire RE constrains # that RE to match a final substring of the subject string. # # | Alternation: two REs separated by "|" match either a # match for the first or a match for the second. # # () A RE enclosed in parentheses matches a match for the # regular expression (parenthesized groups are used # for grouping, and for accessing the matched string # subsequently in the match using the \N expression). # # \N Where N is a digit in the range 1-9, matches the same # string of characters as was matched by a parenthesized # RE to the left in the same RE. The sub-expression # specified is that beginning with the Nth occurrence # of "(" counting from the left. E.g., ^(.*)\1$ matches # a string consisting of two consecutive occurrences of # the same string. # ############################################################################ # # Extensions beyond UNIX egrep # # The following extensions to UNIX REs, as specified in the Perl # programming language, are supported. # # \w Matches any alphanumeric (including "_"). # \W Matches any non-alphanumeric. # # \b Matches only at a word-boundary (word defined as a string # of alphanumerics as in \w). # \B Matches only non-word-boundaries. # # \s Matches any white-space character. # \S Matches any non-white-space character. # # \d Matches any digit [0-9]. # \D Matches any non-digit. # # \w, \W, \s, \S, \d, \D can be used within [string] REs. # ############################################################################ # # Notes on computed conjunction expressions by "suspensive recursion" # # A conjunction expression of an arbitrary number of terms can be # computed in a looping fashion by the following recursive technique: # # procedure Conjunct(v) # if then # suspend Conjunct() # else # suspend v # end # # The argument "v" is needed for producing the value of the last term # as the value of the conjunction expression, accurately modeling Icon # conjunction. If the value of the conjunction is not needed, the # technique can be slightly simplified by eliminating "v": # # procedure ConjunctAndProduceNull() # if then # suspend ConjunctAndProduceNull() # else # suspend # end # # Note that must still remain in the suspend # expression to test for failure of the term, although its value is not # passed to the recursive invocation. This could have been coded as # # suspend & ConjunctAndProduceNull() # # but wouldn't have been as provocative. # # Since the computed conjunctions in this program are evaluated only for # their side effects, the second technique is used in two situations: # # (1) To compute the conjunction of all of the elements in the # regular expression pattern list (Re_match1()). # # (2) To evaluate the "exactly N times" and "N to M times" # control operations (Re_NTimes()). # ############################################################################ record Re_Tok(proc,args) global Re_ParenGroups,Re_Filter,Re_Ordered global Re_WordChars,Re_NonWordChars global Re_Space,Re_NonSpace global Re_Digits,Re_NonDigits global Re_ArbString,Re_AnyString global Re_LeftmostShortest invocable "=":1 ################### Pattern Translation Procedures ################### procedure RePat(s) #: regular expression pattern list # # Produce pattern list representing pattern string s. # # # Create a list of pattern elements. Pattern strings are parsed # and converted into list elements as shown in the following table. # Since some list elements reference other pattern lists, the # structure is really a tree. # # Token Generates Matches... # ----- --------- ---------- # ^ Re_Tok(pos,[1]) Start of string or line # $ Re_Tok(pos,[0]) End of string or line # . Re_Tok(move,[1]) Any single character # + Re_Tok(Re_OneOrMore,[tok]) At least one occurrence of # previous token # * Re_Tok(Re_ArbNo,[tok]) Zero or more occurrences of # previous token # | Re_Tok(Re_Alt,[pattern,pattern]) Either of prior expression # or next expression # [...] Re_Tok(Re_TabAny,[cset]) Any single character in # specified set (see below) # (...) Re_Tok(Re_MatchReg,[pattern]) Parenthesized pattern as # single token # The string of no-special # Re_Tok(Re__tabmatch,string) characters # \b Re_Tok(Re_WordBoundary,[Re_WordChars,Re_NonWordChars]) # A word-boundary # (word default: [A-Za-z0-9_]+) # \B Re_Tok(Re_NonWordBoundary,[Re_WordChars,Re_NonWordChars]) # A non-word-boundary # \w Re_Tok(Re_TabAny,[Re_WordChars])A word-character # \W Re_Tok(Re_TabAny,[Re_NonWordChars]) A non-word-character # \s Re_Tok(Re_TabAny,[Re_Space]) A space-character # \S Re_Tok(Re_TabAny,[Re_NonSpace]) A non-space-character # \d Re_Tok(Re_TabAny,[Re_Digits]) A digit # \D Re_Tok(Re_TabAny,[Re_NonDigits]) A non-digit # {n,m} Re_Tok(Re_NToMTimes,[tok,n,m]) n to m occurrences of # previous token # {n,} Re_Tok(Re_NOrMoreTimes,[tok,n]) n or more occurrences of # previous token # {n} Re_Tok(Re_NTimes,[tok,n]) exactly n occurrences of # previous token # ? Re_Tok(Re_ZeroOrOneTimes,[tok]) one or zero occurrences of # previous token # \ Re_Tok(Re_MatchParenGroup,[n]) The string matched by # parenthesis group # local plist static lastString,lastPList # # Initialize. # initial { Re_Default() lastString := "" lastPList := [] } if s === lastString then return lastPList Re_WordChars := cset(Re_WordChars) Re_NonWordChars := ~Re_WordChars Re_Space := cset(Re_Space) Re_NonSpace := ~Re_Space Re_Digits := cset(Re_Digits) Re_NonDigits := ~Re_Digits s ? (plist := Re_pat1(0)) | fail lastString := s lastPList := plist return plist end procedure Re_pat1(level) # L # # Recursive portion of RePat() # local plist,n,m,c,comma static parenNbr initial { if /Re__match then ReCaseDependent() } if level = 0 then parenNbr := 0 plist := [] # # Loop to put pattern elements on list. # until pos(0) do { (="|",plist := [Re_Tok(Re_Alt,[plist,Re_pat1(level + 1) | fail])]) | put(plist, (="^",pos(2) | &subject[-2] == ("|" | "("),Re_Tok(pos,[1])) | (="$",pos(0) | match("|" | ")"),Re_Tok(pos,[0])) | (match(")"),level > 0,break) | (=Re_ArbString,Re_Tok(Re_Arb)) | (=Re_AnyString,Re_Tok(move,[1])) | (="+",Re_Tok(Re_OneOrMore,[Re_prevTok(plist) | fail])) | (="*",Re_Tok(Re_ArbNo,[Re_prevTok(plist) | fail])) | 1(Re_Tok(Re_TabAny,[c := Re_cset()]),\c | fail) | 3(="(",n := parenNbr +:= 1, Re_Tok(Re_MatchReg,[Re_pat1(level + 1) | fail,n]), move(1) | fail) | (="\\b",Re_Tok(Re_WordBoundary,[Re_WordChars,Re_NonWordChars])) | (="\\B",Re_Tok(Re_NonWordBoundary,[Re_WordChars,Re_NonWordChars])) | (="\\w",Re_Tok(Re_TabAny,[Re_WordChars])) | (="\\W",Re_Tok(Re_TabAny,[Re_NonWordChars])) | (="\\s",Re_Tok(Re_TabAny,[Re_Space])) | (="\\S",Re_Tok(Re_TabAny,[Re_NonSpace])) | (="\\d",Re_Tok(Re_TabAny,[Re_Digits])) | (="\\D",Re_Tok(Re_TabAny,[Re_NonDigits])) | (="{",(n := tab(many(&digits)),comma := =(",") | &null, m := tab(many(&digits)) | &null,="}") | fail, if \m then Re_Tok(Re_NToMTimes, [Re_prevTok(plist),integer(n),integer(m)]) else if \comma then Re_Tok(Re_NOrMoreTimes, [Re_prevTok(plist),integer(n)]) else Re_Tok(Re_NTimes,[Re_prevTok(plist),integer(n)])) | (="?",Re_Tok(Re_ZeroOrOneTimes,[Re_prevTok(plist) | fail])) | Re_Tok(Re__tabmatch,[Re_string(level)]) | (="\\",n := tab(any(&digits)),Re_Tok(Re_MatchParenGroup,[integer(n)])) ) | fail } return plist end procedure Re_prevTok(plist) # # Pull previous token from the pattern list. This procedure must take # into account the fact that successive character tokens have been # optimized into a single string token. # local lastTok,s,r lastTok := pull(plist) | fail if lastTok.proc === Re__tabmatch then { s := lastTok.args[1] r := Re_Tok(Re__tabmatch,[s[-1]]) s[-1] := "" if *s > 0 then { put(plist,lastTok) lastTok.args[1] := s } return r } return lastTok end procedure Re_Default() # # Assign default values to regular expression translation globals, but # only to variables whose values are null. # /Re_WordChars := &letters ++ &digits ++ "_" /Re_Space := ' \t\v\n\r\f' /Re_Digits := &digits /Re_ArbString := ".*" /Re_AnyString := "." return end procedure Re_cset() # # Matches a [...] construct and returns a cset. # local complement,c,e,ch,chars ="[" | fail (complement := ="^" | &null,c := move(1) || tab(find("]")),move(1)) | return &null c ? { e := (="-" | "") while chars := tab(upto('-\\')) do { e ++:= case move(1) of { "-": chars[1:-1] ++ &cset[ord(chars[-1]) + 1:ord(move(1)) + 2] | return &null "\\": case ch := move(1) of { "w": Re_WordChars "W": Re_NonWordChars "s": Re_Space "S": Re_NonSpace "d": Re_Digits "D": Re_NonDigits default: ch } } } e ++:= tab(0) if \complement then e := ~e } e := (\Re_Filter)(e) return cset(e) end procedure Re_string(level) # # Matches a string of non-special characters, returning a string. # local special,s,p static nondigits initial nondigits := ~&digits special := if level = 0 then '\\.+*|[({?' else '\\.+*|[({?)' s := tab(upto(special) | 0) while ="\\" do { p := &pos if tab(any('wWbBsSdD')) | (tab(any('123456789')) & (pos(0) | any(nondigits))) then { tab(p - 1) break } s ||:= move(1) || tab(upto(special) | 0) } if pos(0) & s[-1] == "$" then { move(-1) s[-1] := "" } s := string((\Re_Filter)(s)) return "" ~== s end ##################### Matching Engine Procedures ######################## procedure ReMatch(plist,s,i1,i2) #: position past regular expression matched # # Produce the sequence of positions in s past a string starting at i1 # that matches the pattern plist, but fails if there is no such # position. Similar to match(), but is capable of generating multiple # positions. # local i if type(plist) ~== "list" then plist := RePat(plist) | fail if /s := &subject then /i1 := &pos else /i1 := 1 ; /i2 := 0 i := match("",s,i1,i2) - 1 | fail Re_ParenGroups := [] suspend s[i1:i2] ? (Re_match1(plist,1),i + &pos) end procedure Re_match1(plist,i) # s1,s2,...,sN # # Used privately by ReMatch() to simulate a computed conjunction # expression via recursive generation. # local tok suspend if tok := plist[i] then Re_tok_match(tok,plist,i) & Re_match1(plist,i + 1) else &null end procedure ReFind(plist,s,i1,i2) #: position where regular expression matched # # Produce the sequence of positions in s where strings begin that match # the pattern plist, but fails if there is no such position. Similar # to find(). # local i,p if type(plist) ~== "list" then plist := RePat(plist) | fail if /s := &subject then /i1 := &pos else /i1 := 1 ; /i2 := 0 i := match("",s,i1,i2) - 1 | fail Re_ParenGroups := [] s[i1:i2] ? suspend ( tab(Re_skip(plist)) & p := &pos & Re_match1(plist,1)\1 & i + p) end procedure Re_tok_match(tok,plist,i) # # Match a single token. Can be recursively called by the token # procedure. # local prc,results,result prc := tok.proc if \Re_LeftmostShortest then suspend if prc === Re_Arb then Re_Arb(plist,i) else prc!tok.args else { results := [] every (if prc === Re_Arb then Re_Arb(plist,i) else prc!tok.args) do push(results,[&pos,copy(Re_ParenGroups)]) every result := !results do { Re_ParenGroups := result[2] suspend tab(result[1]) } } end ########## Heuristic Code for Matching Arbitrary Characters ########## procedure Re_skip(plist,i) # s1,s2,...,sN # # Used privately -- match a sequence of strings in s past which a match # of the first pattern element in plist is likely to succeed. This # procedure is used for heuristic performance improvement by ReMatch() # for the ".*" pattern element, and by ReFind(). # local x,s,p,prc,args /i := 1 x := if type(plist) == "list" then plist[i] else plist if /x then suspend find("") else { args := x.args suspend case prc := x.proc of { Re__tabmatch: Re__find!args Re_TabAny: Re__upto!args pos: args[1] Re_WordBoundary | Re_NonWordBoundary: p := &pos & tab(Re_skip(plist,i + 1)) & prc!args & untab(p) Re_MatchParenGroup: if s := \(\Re_ParenGroups)[args[1]] then find(s) else find("") Re_NToMTimes | Re_NOrMoreTimes | Re_NTimes: if args[2] > 0 then Re_skip(args[1]) else find("") Re_OneOrMore | Re_MatchReg: Re_skip(args[1]) Re_Alt: if \Re_Ordered then Re_result_merge{Re_skip(args[1]),Re_skip(args[2])} else Re_skip(args[1 | 2]) default: find("") } } end procedure Re_result_merge(L) # # Programmer-defined control operation to merge the result sequences of # two integer-producing generators. Both generators must produce their # result sequences in numerically increasing order with no duplicates, # and the output sequence will be in increasing order with no # duplicates. # local e1,e2,r1,r2 e1 := L[1] ; e2 := L[2] r1 := @e1 ; r2 := @e2 while \(r1 | r2) do if /r2 | \r1 < r2 then suspend r1 do r1 := @e1 | &null else if /r1 | r1 > r2 then suspend r2 do r2 := @e2 | &null else r2 := @e2 | &null end procedure untab(origPos) # # Converts a string scanning expression that moves the cursor to one # that produces a cursor position and doesn't move the cursor (converts # something like tab(find(x)) to find(x). The template for using this # procedure is # # origPos := &pos ; tab(x) & ... & untab(origPos) # local newPos newPos := &pos tab(origPos) suspend newPos tab(newPos) end ####################### Matching Procedures ####################### procedure Re_Arb(plist,i) # # Match arbitrary characters (.*) # suspend tab(if plist[i + 1] then Re_skip(plist,i + 1) else Re__find("")) end procedure Re_TabAny(C) # # Match a character of a character set ([...],\w,\W,\s,\S,\d,\D) # suspend tab(Re__any(C)) end procedure Re_MatchReg(tokList,groupNbr) # # Match parenthesized group and assign matched string to list Re_ParenGroup # local p,s p := &pos /Re_ParenGroups := [] every Re_match1(tokList,1) do { while *Re_ParenGroups < groupNbr do put(Re_ParenGroups) s := &subject[p:&pos] Re_ParenGroups[groupNbr] := s suspend s } Re_ParenGroups[groupNbr] := &null end procedure Re_WordBoundary(wd,nonwd) # # Match word-boundary (\b) # suspend ((pos(1),any(wd)) | (pos(0),move(-1),tab(any(wd))) | (move(-1), (tab(any(wd)),any(nonwd)) | (tab(any(nonwd)),any(wd))),"") end procedure Re_NonWordBoundary(wd,nonwd) # # Match non-word-boundary (\B) # suspend ((pos(1),any(nonwd)) | (pos(0),move(-1),tab(any(nonwd))) | (move(-1), (tab(any(wd)),any(wd)) | (tab(any(nonwd)),any(nonwd)),"")) end procedure Re_MatchParenGroup(n) # # Match same string matched by previous parenthesized group (\N) # local s suspend if s := \Re_ParenGroups[n] then =s else "" end ################### Control Operation Procedures ################### procedure Re_ArbNo(tok) # # Match any number of times (*) # suspend "" | (Re_tok_match(tok) & Re_ArbNo(tok)) end procedure Re_OneOrMore(tok) # # Match one or more times (+) # suspend Re_tok_match(tok) & Re_ArbNo(tok) end procedure Re_NToMTimes(tok,n,m) # # Match n to m times ({n,m} # suspend Re_NTimes(tok,n) & Re_ArbNo(tok)\(m - n + 1) end procedure Re_NOrMoreTimes(tok,n) # # Match n or more times ({n,}) # suspend Re_NTimes(tok,n) & Re_ArbNo(tok) end procedure Re_NTimes(tok,n) # # Match exactly n times ({n}) # if n > 0 then suspend Re_tok_match(tok) & Re_NTimes(tok,n - 1) else suspend end procedure Re_ZeroOrOneTimes(tok) # # Match zero or one times (?) # suspend "" | Re_tok_match(tok) end procedure Re_Alt(tokList1,tokList2) # # Alternation (|) # suspend Re_match1(tokList1 | tokList2,1) end ################### Case Independence Procedures ################### link noncase global Re__find,Re__match,Re__any,Re__many,Re__upto,Re__tabmatch procedure ReCaseIndependent() Re__find := c_find Re__match := c_match Re__any := c_any Re__many := c_many Re__upto := c_upto Re__tabmatch := Re_c_tabmatch return end procedure ReCaseDependent() Re__find := find Re__match := match Re__any := any Re__many := many Re__upto := upto Re__tabmatch := proc("=",1) return end procedure Re_c_tabmatch(s) suspend tab(c_match(s)) end