############################################################################
#
#   File:     ichartp.icn
#
#   Subject:  Procedures for a simple chart parser
#
#   Author:   Richard L. Goerwitz
#
#   Date:     August 3, 2000
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#   Version:  1.11
#
############################################################################
#
#  General:
#
#      Ichartp implements a simple chart parser - a slow but
#  easy-to-implement strategy for parsing context-free grammars (it
#  has a cubic worst-case time factor).  Chart parsers are flexible
#  enough to handle a lot of natural language constructs.  They also
#  lack many of the troubles associated with empty and left-recursive
#  derivations.  To obtain a parse, just create a BNF file, obtain a
#  line of input, and then invoke parse_sentence(sentence,
#  bnf_filename, start_symbol).  Parse_sentence suspends successive
#  edge structures corresponding to possible parses of the input
#  sentence.  There is a routine called edge2tree() that converts
#  these edges to a more standard form.  See the stub main()
#  procedure for an example of how to make use of all these
#  facilities.
#
############################################################################
#
#  Implementation details:
#
#      The parser itself operates in bottom-up fashion, but it might
#  just as well have been coded top-down, or for that matter as a
#  combination bottom-up/top-down parser (chart parsers don't care).
#  The parser operates in breadth-first fashion, rather than walking
#  through each alternative until it is exhausted.  As a result,
#  there tends to be a pregnant pause before any results appear, but
#  when they appear they come out in rapid succession.  To use a
#  depth-first strategy, just change the "put" in "put(ch.active,
#  new_e)" to read "push."  I haven't tried to do this, but it
#  should be that simple to implement.
#      BNFs are specified using the same notation used in Griswold &
#  Griswold, and as described in the IPL program "pargen.icn," with
#  the following difference:  all metacharacters (space, tab,
#  vertical bar, right/left parends, brackets, and angle brackets)
#  are converted to literals by prepending a backslash.  Comments
#  can be included along with BNFs using the same notation as for
#  Icon code (i.e. the #-sign).
#
############################################################################
#
#  Gotchas:
#
#      Pitfalls to be aware of include things like <HA> ::= <HA> <HA>
#  | ha | () (a weak attempt at a laugh recognizer).  This grammar
#  will accept "ha," "ha ha," etc., but will suspend an infinite
#  number of possible parses.  The right way to do this sort of thing
#  is <HA> ::= ha <HA> | ha, or, if you really insist on having the
#  empty string as a possibility, try things like:
#
#      <LAUGH> ::= () | <HA>
#      <HA>    ::= ha <HA> | ha
#
#      Of course, the whole problem of infinite parses can be avoided
#  by simply invoking the parser in a context where it is not going
#  to be resumed, or else one in which it will be resumed only a
#  finite number of times.
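#
#      For instance, Icon's limitation operator gives an easy way to
#  cap the number of results that will ever be produced.  A minimal
#  sketch (the grammar file name "laugh.bnf" and the start symbol
#  "HA" are only illustrative):
#
#      every result := (parse_sentence(read(), "laugh.bnf", "HA") \ 10) do
#          if result.no = 0 then
#              write(stree(edge2tree(result.item)))
#
#  At most ten results are generated here, however many parses the
#  grammar could in principle suspend.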
#
############################################################################
#
#  Motivation:
#
#      I was reading Byte Magazine (vol. 17:2 [February, 1992]), and
#  ran into an article entitled "A Natural Solution" (pages 237-244)
#  in which a standard chart parser was described in terms of its
#  C++ implementation.  The author remarked at how his optimizations
#  made it possible to parse a 14-word sentence in only 32 seconds
#  (versus 146 for a straight Gazdar-Mellish LISP chart parser).  32
#  seconds struck me as hardly anything to write home about, so I
#  coded up a quick system in Icon to see how it compared.  This
#  library is the result.
#      I'm quite sure that this code could be very much improved
#  upon.  As it stands, its performance seems as good as the C++
#  parser in BYTE, if not better.  It's hard to tell, though, seeing
#  as I have no idea what hardware the guy was using.  I'd guess a
#  386 running DOS.  On a 386 running Xenix the Icon version beats
#  the BYTE times by a factor of about four.  The Icon compiler
#  creates an executable that (in the above environment) parses
#  14-15 word sentences in anywhere from 6 to 8 seconds.  Once the
#  BNF file is read, it does short sentences in a second or two.  If
#  I get around to writing it, I'll probably use the code here as
#  the basic parsing engine for an adventure game my son wants me to
#  write.
#
############################################################################
#
#  Links:  trees, rewrap, scan, strip, stripcom, strings
#
############################################################################
#
#  Requires:  co-expressions
#
############################################################################
#
#      Here's a sample BNF file (taken, modified, from the BYTE
#  Magazine article mentioned above).  Note again the conventions a)
#  that nonterminals be enclosed in angle brackets & b) that overlong
#  lines be continued by terminating the preceding line with a
#  backslash.  Although not illustrated below, the metacharacters <,
#  >, (, ), and | can all be escaped (i.e. can all have their special
#  meaning neutralized) with a backslash (e.g. \<).  Comments can
#  also be included using the Icon #-notation.  Empty symbols are
#  illegal, so if you want to specify a zero-derivation, use "()."
#  There is an example of this usage below.
#
#      <S>         ::= <NP> <VP> | <S> <CONJ> <S>
#      <NP>        ::= <PRONOUN> | <DET> <ADJLIST> <NOUN> \
#                      ( <PP> | <RELCLAUSE> | () )
#      <ADJLIST>   ::= () | <ADJ> <ADJLIST>
#      <VP>        ::= <VERB> | <VERBPLUS> <NP> | <VERB> <PP>
#      <PP>        ::= <PREP> <NP>
#      <RELCLAUSE> ::= <RELPRO> <VP>
#      <CONJ>      ::= and
#      <DET>       ::= the | a | his | her
#      <PRONOUN>   ::= her | he | they
#      <NOUN>      ::= nurse | nurses | book | books | travel | arrow | arrows | \
#                      fortune | fortunes | report
#      <ADJ>       ::= outrageous | silly | blue | green | heavy | white | red | \
#                      black | yellow
#      <VERB>      ::= travel | travels | report | see | suffer
#      <VERBPLUS>  ::= hear | see | suffer
#      <PREP>      ::= on | of
#      <RELPRO>    ::= that
#
############################################################################
#
#  Addendum:
#
#      Sometimes, when writing BNFs, one finds oneself repeatedly
#  writing the same things.  In an effort to eliminate the need for
#  doing this, I've written a simple macro facility.  It involves one
#  reserved word:  "define."  Just make sure it begins a line.  It
#  takes two arguments.  The first is the macro.  The second is its
#  expansion.  The first argument must not contain any spaces.  The
#  second, however, may.  Here's an example:
#
#      define <NOMINAL> ( <DET> <ADJLIST> <NOUN> | \
#                         <DET> <NOUN> | \
#                         <PRONOUN> \
#                       )
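#
#      With the define above (the macro name <NOMINAL> is arbitrary;
#  the other symbols are those of the sample grammar), any later
#  occurrence of <NOMINAL> in a rule is textually replaced by
#  ( <DET> <ADJLIST> <NOUN> | <DET> <NOUN> | <PRONOUN> ) before the
#  rule is parsed, so that, for example,
#
#      <S> ::= <NOMINAL> <VP>
#
#  behaves exactly as if the parenthesized expansion had been typed
#  out in full.  The substitution is plain text replacement, done
#  with replace() from the IPL strings module.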
#
############################################################################

link trees
link scan
link rewrap
link strip
link stripcom
link strings

record stats(edge_list, lhs_table, term_set)
record chart(inactive, active)          # inactive - set; active - list
record retval(no, item)
record edge(LHS, RHS, LEN, DONE, BEG, END, SEEN)
record short_edge(LHS, RHS)

#
# For debugging only.
#
procedure main(a)

    local res, filename, line
    # &trace := -1

    filename := \a[1] | "bnfs.byte"
    while line := read(&input) do {
        res := &null
        every res := parse_sentence(line, filename, "S") do {
            if res.no = 0 then
                write(stree(edge2tree(res.item)))
                # write(ximage(res.item))
            else if res.no = 1 then {
                write("hmmm")
                write(stree(edge2tree(res.item)))
            }
        }
        /res & write("can't parse ", line)
    }

end


#
# parse_sentence:  string x string x string -> edge records
#                  (s, filename, start_symbol) -> Es
#     where s is a chunk of text presumed to constitute a sentence
#     where filename is the name of a grammar file containing BNFs
#     where start_symbol is the nonterminal (angle brackets omitted)
#         that must span the whole sentence for a parse to be returned
#     where Es are edge records containing possible parses of s
#
procedure parse_sentence(s, filename, start_symbol)

    local file, e, i, elist, ltbl, tset, ch, tokens, st, memb, new_e,
          token_set, none_found, active_modified
    static master, old_filename
    initial master := table()

    #
    # Initialize and store stats for filename (if not already stored).
    #
    if not (filename == \old_filename) then {
        file := open(filename, "r") | p_err(filename, 7)
        #
        # Read BNFs from file; turn them into edge structs, and
        # store them all in a list; insert terminal symbols into a set.
        #
        elist := list(); ltbl := table(); tset := set()
        every e := bnf_file_2_edges(file) do {
            put(elist, e)                       # main edge list (active)
            (/ltbl[e.LHS] := set([e])) | insert(ltbl[e.LHS], e)   # index LHSs
            every i := 1 to e.LEN do            # LEN holds length of e.RHS
                if /e.RHS[i].RHS then           # RHS for terminals is null
                    insert(tset, e.RHS[i].LHS)
        }
        insert(master, filename, stats(elist, ltbl, tset))
        old_filename := filename
        close(file)
    }
    elist := fullcopy(master[filename].edge_list)
    ltbl := fullcopy(master[filename].lhs_table)
    tset := master[filename].term_set

    #
    # Make edge list into the active section of chart; tokenize the
    # sentence s & check for unrecognized terminals.
    #
    ch := chart(set(), elist)
    tokens := tokenize(s)

    #
    # Begin parse by entering all tokens in s into the inactive set
    # in the chart as edges with no RHS (a NULL RHS is characteristic
    # of all terminals).
    #
    token_set := set(tokens)
    every i := 1 to *tokens do {
        # Flag words not in the grammar as errors.
        if not member(tset, tokens[i]) then
            suspend retval(1, tokens[i])
        # Now, give us an inactive edge corresponding to word i.
        insert(ch.inactive, e := edge(tokens[i], &null, 1, 1, i, i+1))
        # Insert word i into the LHS table.
        (/ltbl[tokens[i]] := set([e])) | insert(ltbl[tokens[i]], e)
        # Watch out for those empty RHSs.
        insert(ch.inactive, e := edge("", &null, 1, 1, i, i))
        (/ltbl[""] := set([e])) | insert(ltbl[""], e)
    }
    *tokens = 0 & i := 0
    insert(ch.inactive, e := edge("", &null, 1, 1, i+1, i+1))
    (/ltbl[""] := set([e])) | insert(ltbl[""], e)

    #
    # Until no new active edges can be built, keep ploughing through
    # the active edge list, trying to match unconfirmed members of
    # their RHSs up with inactive edges.
    #
    until \none_found do {
        # write(ximage(ch))
        none_found := 1
        every e := !ch.active do {
            active_modified := &null
            # keep track of inactive edges we've already tried
            /e.SEEN := set()
            #
            # e.RHS[e.DONE+1] is the first unconfirmed category in the
            # RHS of e; ltbl[e.RHS[e.DONE+1].LHS] are all edges having
            # as their LHS the LHS of the first unconfirmed category in
            # e's RHS; we simply intersect this set with the inactives,
            # and then subtract out those we've seen before in
            # connection with this edge -
            #
            if *(st := \ltbl[e.RHS[e.DONE+1].LHS] ** ch.inactive -- e.SEEN) > 0 then {
                # record all the inactive edges being looked at as seen
                e.SEEN ++:= st
                every memb := !st do {
                    # make sure this inactive edge starts where the
                    # last confirmed edge in e.RHS ends!
                    if memb.BEG ~= \e.RHS[e.DONE].END then next
                    # set none_found to indicate we've created a new edge
                    else none_found := &null
                    # create a new edge, having the LHS of e, the RHS of
                    # e, the start point of e, the end point of memb, and
                    # one more confirmed RHS member than e
                    new_e := edge(e.LHS, fullcopy(e.RHS), e.LEN,
                                  e.DONE+1, e.BEG, memb.END)
                    new_e.RHS[new_e.DONE] := memb
                    /new_e.BEG := memb.BEG
                    if new_e.LEN = new_e.DONE then {
                        # it's inactive
                        insert(ch.inactive, new_e)
                        insert(ltbl[e.LHS], new_e)
                        if new_e.BEG = 1 & new_e.END = (*tokens+1) then {
                            if new_e.LHS == start_symbol    # complete parse
                            then suspend retval(0, new_e)
                        }
                    }
                    else {
                        put(ch.active, new_e)               # it's active
                        active_modified := 1
                    }
                }
            }
            # restart if the ch.active list has been modified
            if \active_modified then break next
        }
    }

end


#
# tokenize:  break up a sentence into constituent words, using spaces,
#            tabs, and other punctuation as separators (we'll need to
#            change this a bit later on to cover apostrophed words)
#
procedure tokenize(s)

    local l, word

    l := list()
    s ? {
        while tab(upto(&letters)) do
            put(l, map(tab(many(&letters))))
    }
    return l

end
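
#
# A possible refinement of tokenize(), sketched here only and not used
# by the parser above:  keep apostrophes inside words, so that forms
# like "don't" come through as single tokens, which a grammar can then
# list as terminals.  The name tokenize_apos is illustrative.
#
procedure tokenize_apos(s)

    local l

    l := list()
    s ? {
        while tab(upto(&letters)) do
            # accept any embedded apostrophes along with the letters
            put(l, map(tab(many(&letters ++ '\''))))
    }
    return l

end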

#
# edge2tree:  edge -> tree
#             e -> t
#
#     where e is an edge structure (active or inactive; both are okay)
#     where t is a tree like what's described in Ralph Griswold's
#     structs library (IPL); I don't know about the 2nd ed. of
#     Griswold & Griswold, but the structure is described in the 1st
#     ed. in section 16.1
#
#     fails if, for some reason, the conversion can't be made (e.g.
#     the edge structure has been screwed around with in some way)
#
procedure edge2tree(e)

    local memb, t

    t := [e.LHS]
    \e.RHS | (return t)                         # a terminal
    type(e) == "edge" | (return put(t, []))     # An incomplete edge
    every memb := !e.RHS do                     # has daughters.
        put(t, edge2tree(memb))
    return t

end


#
# bnf_file_2_edges:  concatenate backslash-final lines & parse
#
procedure bnf_file_2_edges(f)

    local getline, line, macro_list, old, new, i

    macro_list := list()
    getline := create stripcom(!f)
    while line := @getline do {
        while line ?:= 1(tab(-2) || tab(slshupto('\\')), pos(-1)) || @getline
        line ? {
            if ="define" then {
                tab(many('\t '))
                old := tab(slshupto('\t ')) |
                    stop("bnf_file_2_edges", 7, tab(0))
                tab(many('\t '))
                new := tab(0)
                (!macro_list)[1] == old &
                    stop("bnf_file_2_edges", 8, old)
                put(macro_list, [old, new])
                next                            # go back to main loop
            }
            else {
                every i := 1 to *macro_list do
                    # Replace is in the IPL (strings.icn).
                    line := replace(line, macro_list[i][1], macro_list[i][2])
                suspend bnf_2_edges(line)
            }
        }
    }

end


#
# bnf_2_edges:  string -> edge records
#               s -> Es (a generator)
#     where s is a CFPSG rule in BNF form
#     where Es are edges
#
procedure bnf_2_edges(s)

    local tmp, RHS, LHS

    #
    # Break BNF-style CFPSG rule into LHS and RHS.  If there is more
    # than one RHS (a la the | alternation op), suspend multiple
    # results.
    #
    s ? {

        # tab upto the ::= sign
        tmp := (tab(slshupto(':')) || ="::=") | p_err(s, 1)

        # strip non-backslashed spaces, and extract LHS symbol
        stripspaces(tmp) ? {
            LHS := 1(tab(slashbal(':', '<', '>')), ="::=") | p_err(s, 1)
            LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 2)
            LHS == "" & p_err(s, 6)
        }

        every RHS := do_slash(tab(0) \ 1) do {
            RHS := string_2_list(RHS)
            suspend edge(LHS, RHS, *RHS, 0, &null, &null)
        }
    }

end


#
# string_2_list:  string -> list
#                 s -> L
#     where L is a list of partially constructed (short) edges, having
#     only LHS and RHS; in the case of nonterminals, the RHS is set
#     to 1, while for terminals the RHS is null (and remains that way
#     throughout the parse)
#
procedure string_2_list(s)

    local tmp, RHS_list, LHS

    (s || "\x00") ? {
        tab(many(' \t'))
        pos(-1) & (return [short_edge("", &null)])
        RHS_list := list()
        repeat {
            tab(many(' \t'))
            pos(-1) & break
            if match("<") then {
                tmp := ("" ~== tab(slashbal(&cset, '<', '>'))) | p_err(s, 4)
                LHS := stripspaces(tmp)
                LHS ?:= strip(2(="<", tab(-1), =">"), '\\') | p_err(s, 4)
                LHS == "" & p_err(s, 10)
                put(RHS_list, short_edge(LHS, 1))
            }
            else {
                LHS := stripspaces(tab(slshupto(' <') | -1))
                slshupto('>', LHS) & p_err(s, 5)
                put(RHS_list, short_edge(strip(LHS, '\\'), &null))
            }
        }
    }

    return RHS_list

end


#
# fullcopy:  make full recursive copy of object
#
procedure fullcopy(obj)

    local retval, i, k

    case type(obj) of {
        "co-expression"  : return obj
        "cset"           : return obj
        "file"           : return obj
        "integer"        : return obj
        "list"           : {
            retval := list(*obj)
            every i := 1 to *obj do
                retval[i] := fullcopy(obj[i])
            return retval
        }
        "null"           : return &null
        "procedure"      : return obj
        "real"           : return obj
        "set"            : {
            retval := set()
            every insert(retval, fullcopy(!obj))
            return retval
        }
        "string"         : return obj
        "table"          : {
            # obj[[]] subscripts obj with a brand-new list, which cannot
            # be an existing key, and so yields the table's default value
            retval := table(obj[[]])
            every k := key(obj) do
                insert(retval, fullcopy(k), fullcopy(obj[k]))
            return retval
        }
        # probably a record; if not, we're dealing with a new
        # version of Icon or a nonstandard implementation, and
        # we're screwed
        default          : {
            retval := copy(obj)
            every i := 1 to *obj do
                retval[i] := fullcopy(obj[i])
            return retval
        }
    }

end


#
# do_slash:  string -> string(s)
#     Given a|b suspend a then b.  Used in conjunction with do_parends().
#
procedure do_slash(s)

    local chunk

    s ? {
        while chunk := tab(slashbal('|', '(', ')')) do {
            suspend do_parends(chunk)
            move(1)
        }
        suspend do_parends(tab(0))
    }

end


#
# do_parends:  string -> string(s)
#     Given a(b)c suspend abc; given a(b|c)d suspend abd and acd, etc.
#     Used in conjunction with do_slash().
#
procedure do_parends(s)

    local chunk, i, j

    s ? {
        if not (i := slshupto('(')) then {
            chunk := tab(0)
            slshupto(')') & p_err(s, 8)
            suspend chunk
        }
        else {
            j := i + slashbal(')', '(', ')', s[i+1:0]) | p_err(s, 9)
            suspend tab(i) || (move(1), do_slash(tab(j))) ||
                (move(1), do_parends(tab(0)))
        }
    }

end


#
# p_err:  print error message to stderr & abort
#
procedure p_err(s, n)

    local i, msg
    static errlist

    initial {
        errlist := [[1, "malformed LHS"],
                    [2, "nonterminal lacks proper <> enclosure"],
                    [3, "missing left angle bracket"],
                    [4, "unmatched left angle bracket"],
                    [5, "unmatched right angle bracket"],
                    [6, "empty symbol in LHS"],
                    [7, "unable to open file"],
                    [8, "unmatched right parenthesis"],
                    [9, "unmatched left parenthesis"],
                    [10, "empty symbol in RHS"]
                   ]
    }

    every i := 1 to *errlist do
        if errlist[i][1] = n then
            msg := errlist[i][2]

    writes(&errout, "error ", n, " (", msg, ") in \n")
    every write("\t", rewrap(s) | rewrap())
    exit(n)

end


#
# Remove non-backslashed spaces and tabs.
#
procedure stripspaces(s)

    local s2

    s2 := ""
    s ? {
        while s2 ||:= tab(slshupto(' \t')) do
            tab(many(' \t'))
        s2 ||:= tab(0)
    }

    return s2

end
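
#
# A minimal end-to-end illustration (the file name "tiny.bnf" and the
# grammar below are only examples, not part of the library).  Suppose
# tiny.bnf contains:
#
#     <S>  ::= <NP> <VP>
#     <NP> ::= the <N> | <N>
#     <N>  ::= nurse | nurses
#     <VP> ::= <V> | <V> <NP>
#     <V>  ::= see | sees
#
# Then a parse of a sentence could be obtained and printed with:
#
#     every r := parse_sentence("the nurse sees the nurses", "tiny.bnf", "S") do
#         if r.no = 0 then
#             write(stree(edge2tree(r.item)))
#
# Note that the start symbol is passed without its angle brackets,
# just as the stub main() above passes "S".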