############################################################################ # # File: rsg.icn # # Subject: Program to generate randomly selected sentences # # Author: Ralph E. Griswold # # Date: March 26, 2002 # ############################################################################ # # This file is in the public domain. # ############################################################################ # # This program generates randomly selected strings (``sen- # tences'') from a grammar specified by the user. Grammars are # basically context-free and resemble BNF in form, although there # are a number of extensions. # ############################################################################ # # The program works interactively, allowing the user to build, # test, modify, and save grammars. Input to rsg consists of various # kinds of specifications, which can be intermixed: # # Productions define nonterminal symbols in a syntax similar to # the rewriting rules of BNF with various alternatives consisting # of the concatenation of nonterminal and terminal symbols. Gen- # eration specifications cause the generation of a specified number # of sentences from the language defined by a given nonterminal # symbol. Grammar output specifications cause the definition of a # specified nonterminal or the entire current grammar to be written # to a given file. Source specifications cause subsequent input to # be read from a specified file. # # In addition, any line beginning with # is considered to be a # comment, while any line beginning with = causes the rest of that # line to be used subsequently as a prompt to the user whenever rsg # is ready for input (there normally is no prompt). A line consist- # ing of a single = stops prompting. # # Productions: Examples of productions are: # # ::=|+ # ::=|* # ::=x|y|z|() # # Productions may occur in any order. The definition for a nonter- # minal symbol can be changed by specifying a new production for # it. # # There are a number of special devices to facilitate the defin- # ition of grammars, including eight predefined, built-in nontermi- # nal symbols: # symbol definition # < # > # | # newline # <> empty string # <&lcase> any single lowercase letter # <&ucase> any single uppercase letter # <&digit> any single digit # # In addition, if the string between a < and a > begins and ends # with a single quotation mark, it stands for any single character # between the quotation marks. For example, # # <'xyz'> # # is equivalent to # # x|y|z # # Generation Specifications: A generation specification consists of # a nonterminal symbol followed by a nonnegative integer. An exam- # ple is # # 10 # # which specifies the generation of 10 s. If the integer is # omitted, it is assumed to be 1. Generated sentences are written # to standard output. # # Grammar Output Specifications: A grammar output specification # consists of a nonterminal symbol, followed by ->, followed by a # file name. Such a specification causes the current definition of # the nonterminal symbol to be written to the given file. If the # file is omitted, standard output is assumed. If the nonterminal # symbol is omitted, the entire grammar is written out. Thus, # # -> # # causes the entire grammar to be written to standard output. # # Source Specifications: A source specification consists of @ fol- # lowed by a file name. Subsequent input is read from that file. # When an end of file is encountered, input reverts to the previous # file. Input files can be nested. # # Options: The following options are available: # # -s n Set the seed for random generation to n. # # -r In the absence of -s, set the seed to 0 for repeatable # results. Otherwise the seed is set to a different value # for each run (as far as this is possible). -r is equivalent # to -s 0. # # -l n Terminate generation if the number of symbols remaining # to be processed exceeds n. The default is limit is 1000. # # -t Trace the generation of sentences. Trace output goes to # standard error output. # # Diagnostics: Syntactically erroneous input lines are noted but # are otherwise ignored. Specifications for a file that cannot be # opened are noted and treated as erroneous. # # If an undefined nonterminal symbol is encountered during gen- # eration, an error message that identifies the undefined symbol is # produced, followed by the partial sentence generated to that # point. Exceeding the limit of symbols remaining to be generated # as specified by the -l option is handled similarly. # # Caveats: Generation may fail to terminate because of a loop in # the rewriting rules or, more seriously, because of the progres- # sive accumulation of nonterminal symbols. The latter problem can # be identified by using the -t option and controlled by using the # -l option. The problem often can be circumvented by duplicating # alternatives that lead to fewer rather than more nonterminal sym- # bols. For example, changing # # ::=|* # # to # # ::=||* # # increases the probability of selecting from 1/2 to 2/3. # # There are many possible extensions to the program. One of the # most useful would be a way to specify the probability of select- # ing an alternative. # ############################################################################ # # Links: options, random # ############################################################################ link options link random global defs, ifile, in, limit, prompt, tswitch record nonterm(name) record charset(chars) procedure main(args) local line, plist, s, opts # procedures to try on input lines plist := [define,generate,grammar,source,comment,prompter,error] defs := table() # table of definitions defs["lb"] := [["<"]] # built-in definitions defs["rb"] := [[">"]] defs["vb"] := [["|"]] defs["nl"] := [["\n"]] defs[""] := [[""]] defs["&lcase"] := [[charset(&lcase)]] defs["&ucase"] := [[charset(&ucase)]] defs["&digit"] := [[charset(&digits)]] opts := options(args,"tl+s+r") limit := \opts["l"] | 1000 tswitch := \opts["t"] &random := \opts["s"] if /opts["s"] & /opts["r"] then randomize() ifile := [&input] # stack of input files prompt := "" while in := pop(ifile) do { # process all files repeat { if *prompt ~= 0 then writes(prompt) line := read(in) | break while line[-1] == "\\" do line := line[1:-1] || read(in) | break (!plist)(line) } close(in) } end # process alternatives # procedure alts(defn) local alist alist := [] defn ? while put(alist,syms(tab(upto('|') | 0))) do move(1) | break return alist end # look for comment # procedure comment(line) if line[1] == "#" then return end # look for definition # procedure define(line) return line ? defs[(="<",tab(find(">::=")))] := (move(4),alts(tab(0))) end # define nonterminal # procedure defnon(sym) local chars, name if sym ? { ="'" & chars := cset(tab(-1)) & ="'" } then return charset(chars) else return nonterm(sym) end # note erroneous input line # procedure error(line) write("*** erroneous line: ",line) return end # generate sentences # procedure gener(goal) local pending, symbol pending := [nonterm(goal)] while symbol := get(pending) do { if \tswitch then write(&errout,symimage(symbol),listimage(pending)) case type(symbol) of { "string": writes(symbol) "charset": writes(?symbol.chars) "nonterm": { pending := ?\defs[symbol.name] ||| pending | { write(&errout,"*** undefined nonterminal: <",symbol.name,">") break } if *pending > \limit then { write(&errout,"*** excessive symbols remaining") break } } } } write() end # look for generation specification # procedure generate(line) local goal, count if line ? { ="<" & goal := tab(upto('>')) \ 1 & move(1) & count := (pos(0) & 1) | integer(tab(0)) } then { every 1 to count do gener(goal) return } else fail end # get right hand side of production # procedure getrhs(a) local rhs rhs := "" every rhs ||:= listimage(!a) || "|" return rhs[1:-1] end # look for request to write out grammar # procedure grammar(line) local file, out, name if line ? { name := tab(find("->")) & move(2) & file := tab(0) & out := if *file = 0 then &output else { open(file,"w") | { write(&errout,"*** cannot open ",file) fail } } } then { (*name = 0) | (name[1] == "<" & name[-1] == ">") | fail pwrite(name,out) if *file ~= 0 then close(out) return } else fail end # produce image of list of grammar symbols # procedure listimage(a) local s, x s := "" every x := !a do s ||:= symimage(x) return s end # look for new prompt symbol # procedure prompter(line) if line[1] == "=" then { prompt := line[2:0] return } end # write out grammar # procedure pwrite(name,ofile) local nt, a static builtin initial builtin := ["lb","rb","vb","nl","","&lcase","&ucase","&digit"] if *name = 0 then { a := sort(defs,3) while nt := get(a) do { if nt == !builtin then { get(a) next } write(ofile,"<",nt,">::=",getrhs(get(a))) } } else write(ofile,name,"::=",getrhs(\defs[name[2:-1]])) | write("*** undefined nonterminal: ",name) end # look for file with input # procedure source(line) local file, new return line ? { if ="@" then { new := open(file := tab(0)) | { write(&errout,"*** cannot open ",file) fail } push(ifile,in) & in := new return } } end # produce string image of grammar symbol # procedure symimage(x) return case type(x) of { "string": x "nonterm": "<" || x.name || ">" "charset": "<'" || x.chars || "'>" } end # process the symbols in an alternative # procedure syms(alt) local slist static nonbrack initial nonbrack := ~'<' slist := [] alt ? while put(slist,tab(many(nonbrack)) | defnon(2(="<",tab(upto('>')),move(1)))) return slist end