############################################################################
#
#	File:     itokens.icn
#
#	Subject:  Procedures for tokenizing Icon code
#
#	Author:   Richard L. Goerwitz
#
#	Date:     March 3, 1996
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#	Version:  1.11
#
############################################################################
#
#  This file contains itokens() - a utility for breaking Icon source
#  files up into individual tokens.  This is the sort of routine one
#  needs to have around when implementing things like pretty printers,
#  preprocessors, code obfuscators, etc.  It would also be useful for
#  implementing cut-down versions of Icon written in Icon - the sort
#  of thing one might use in an interactive tutorial.
#
#  Itokens(f, x) takes, as its first argument, f, an open file, and
#  suspends successive TOK records.  TOK records contain two fields.
#  The first field, sym, contains a string that represents the name
#  of the next token (e.g. "CSET", "STRING", etc.).  The second
#  field, str, gives that token's literal value.  E.g. the TOK for a
#  literal semicolon is TOK("SEMICOL", ";").  For a mandatory
#  newline, itokens would suspend TOK("SEMICOL", "\n").
#
#  Unlike Icon's own tokenizer, itokens() does not return an EOFX
#  token on end-of-file, but rather simply fails.  It also can be
#  instructed to return syntactically meaningless newlines by passing
#  it a nonnull second argument (e.g. itokens(infile, 1)).  These
#  meaningless newlines are returned as TOK records with a null sym
#  field (i.e. TOK(&null, "\n")).
#
#  NOTE WELL:  If new reserved words or operators are added to a
#  given implementation, the tables below will have to be altered.
#  Note also that &keywords should be implemented on the syntactic
#  level - not on the lexical one.  As a result, a keyword like
#  &features will be suspended as TOK("CONJUNC", "&") and
#  TOK("IDENT", "features").
#
############################################################################
#
#  Links: scan
#
############################################################################
#
#  Requires: coexpressions
#
############################################################################

link scan

global next_c, line_number

record TOK(sym, str)

#
# main:  an Icon source code uglifier
#
#     Stub main for testing; uncomment & compile.  The resulting
#     executable will act as an Icon file compressor, taking the
#     standard input and outputting Icon code stripped of all
#     unnecessary whitespace.  Guaranteed to make the code a visual
#     mess :-).
#
#procedure main()
#
#    local separator, T
#    separator := ""
#    every T := itokens(&input) do {
#        if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
#        then writes(separator)
#        if T.sym == "SEMICOL" then writes(";") else writes(T.str)
#        if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
#        then separator := " " else separator := ""
#    }
#
#end
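#
# Another stub for testing, this time exercising the nostrip option -
# a minimal sketch only, assuming nothing beyond the itokens()
# interface described above.  Uncomment & compile as with the stub
# above.  It echoes the standard input's tokens separated by blanks,
# but uses the null-sym newline TOKs to retain the input's original
# line structure:
#
#procedure main()
#
#    local T
#    every T := itokens(&input, 1) do {
#        # null sym = meaningless newline; SEMICOL + "\n" = mandatory one
#        if /T.sym | (T.sym == "SEMICOL" & T.str == "\n") then write()
#        else writes(T.str, " ")
#    }
#
#end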
#
# itokens:  file x anything    -> TOK records (a generator)
#           (stream, nostrip)  -> Rs
#
#     Where stream is an open file, anything is any object (it only
#     matters whether it is null or not), and Rs are TOK records.
#     Note that itokens strips out useless newlines.  If the second
#     argument is nonnull, itokens does not strip out superfluous
#     newlines.  It may be useful to keep them when the original line
#     structure of the input file must be maintained.
#
procedure itokens(stream, nostrip)

    local T, last_token

    # initialize to some meaningless value
    last_token := TOK()

    every T := \iparse_tokens(stream) do {
        if \T.sym then {
            if T.sym == "EOFX" then fail
            else {
                #
                # If the last token was a semicolon, then interpret
                # all ambiguously unary/binary sequences like "**" as
                # beginners (** could be two unary stars or the [c]set
                # intersection operator).
                #
                if \last_token.sym == "SEMICOL" then
                    suspend last_token := expand_fake_beginner(T)
                else suspend last_token := T
            }
        }
        else {
            if \nostrip then
                suspend last_token := T
        }
    }

end

#
# expand_fake_beginner:  TOK record -> TOK records
#
#     Some "beginner" tokens aren't really beginners.  They are token
#     sequences that could be either a single binary operator or a
#     series of unary operators.  The tokenizer's job is just to snap
#     up as many characters as could logically constitute an operator.
#     Here is where we decide whether to break the sequence up into
#     more than one op or not.
#
procedure expand_fake_beginner(next_token)

    static exptbl
    initial {
        exptbl := table()
        insert(exptbl, "CONCAT",  [TOK("BAR", "|"),  TOK("BAR", "|")])
        insert(exptbl, "DIFF",    [TOK("MINUS", "-"), TOK("MINUS", "-")])
        insert(exptbl, "EQUIV",   [TOK("NUMEQ", "="), TOK("NUMEQ", "="),
                                   TOK("NUMEQ", "=")])
        insert(exptbl, "INTER",   [TOK("STAR", "*"),  TOK("STAR", "*")])
        insert(exptbl, "LCONCAT", [TOK("BAR", "|"),  TOK("BAR", "|"),
                                   TOK("BAR", "|")])
        insert(exptbl, "LEXEQ",   [TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
        insert(exptbl, "LEXNE",   [TOK("TILDE", "~"), TOK("NUMEQ", "="),
                                   TOK("NUMEQ", "=")])
        insert(exptbl, "NOTEQUIV",[TOK("TILDE", "~"), TOK("NUMEQ", "="),
                                   TOK("NUMEQ", "="), TOK("NUMEQ", "=")])
        insert(exptbl, "NUMNE",   [TOK("TILDE", "~"), TOK("NUMEQ", "=")])
        insert(exptbl, "UNION",   [TOK("PLUS", "+"),  TOK("PLUS", "+")])
    }

    if \exptbl[next_token.sym] then
        suspend !exptbl[next_token.sym]
    else return next_token

end
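#
# An example of the expansion performed above:  if the last token was
# a SEMICOL, then input beginning
#
#     **a
#
# cannot open with the cset/set intersection operator, so the INTER
# token that the tokenizer greedily scanned is re-suspended by
# expand_fake_beginner() as two unary TOK("STAR", "*") tokens (cf. the
# exptbl entries above).
#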
#
# iparse_tokens:  file     -> TOK records (a generator)
#                 (stream) -> tokens
#
#     Where file is an open input stream, and tokens are TOK records
#     holding both the token type and actual token text.
#
#     TOK records contain two parts, a preterminal symbol (the first
#     "sym" field), and the actual text of the token ("str").  The
#     parser only pays attention to the sym field, although the
#     strings themselves get pushed onto the value stack.
#
#     Note the following kludge:  Unlike real Icon tokenizers, this
#     procedure returns syntactically meaningless newlines as TOK
#     records with a null sym field.  Normally they would be ignored.
#     I wanted to return them so they could be printed on the output
#     stream, thus preserving the line structure of the original
#     file, and making later diagnostic messages more usable.
#
procedure iparse_tokens(stream, getchar)

    local elem, whitespace, token, last_token, primitives, reserveds
    static be_tbl, reserved_tbl, operators
    initial {

        # Primitive Tokens
        #
        primitives := [
            ["identifier",      "IDENT",      "be"],
            ["integer-literal", "INTLIT",     "be"],
            ["real-literal",    "REALLIT",    "be"],
            ["string-literal",  "STRINGLIT",  "be"],
            ["cset-literal",    "CSETLIT",    "be"],
            ["end-of-file",     "EOFX",       ""  ]]

        # Reserved Words
        #
        reserveds := [
            ["break",           "BREAK",      "be"],
            ["by",              "BY",         ""  ],
            ["case",            "CASE",       "b" ],
            ["create",          "CREATE",     "b" ],
            ["default",         "DEFAULT",    "b" ],
            ["do",              "DO",         ""  ],
            ["else",            "ELSE",       ""  ],
            ["end",             "END",        "b" ],
            ["every",           "EVERY",      "b" ],
            ["fail",            "FAIL",       "be"],
            ["global",          "GLOBAL",     ""  ],
            ["if",              "IF",         "b" ],
            ["initial",         "INITIAL",    "b" ],
            ["invocable",       "INVOCABLE",  ""  ],
            ["link",            "LINK",       ""  ],
            ["local",           "LOCAL",      "b" ],
            ["next",            "NEXT",       "be"],
            ["not",             "NOT",        "b" ],
            ["of",              "OF",         ""  ],
            ["procedure",       "PROCEDURE",  ""  ],
            ["record",          "RECORD",     ""  ],
            ["repeat",          "REPEAT",     "b" ],
            ["return",          "RETURN",     "be"],
            ["static",          "STATIC",     "b" ],
            ["suspend",         "SUSPEND",    "be"],
            ["then",            "THEN",       ""  ],
            ["to",              "TO",         ""  ],
            ["until",           "UNTIL",      "b" ],
            ["while",           "WHILE",      "b" ]]

        # Operators
        #
        operators := [
            [":=",              "ASSIGN",     ""  ],
            ["@",               "AT",         "b" ],
            ["@:=",             "AUGACT",     ""  ],
            ["&:=",             "AUGAND",     ""  ],
            ["=:=",             "AUGEQ",      ""  ],
            ["===:=",           "AUGEQV",     ""  ],
            [">=:=",            "AUGGE",      ""  ],
            [">:=",             "AUGGT",      ""  ],
            ["<=:=",            "AUGLE",      ""  ],
            ["<:=",             "AUGLT",      ""  ],
            ["~=:=",            "AUGNE",      ""  ],
            ["~===:=",          "AUGNEQV",    ""  ],
            ["==:=",            "AUGSEQ",     ""  ],
            [">>=:=",           "AUGSGE",     ""  ],
            [">>:=",            "AUGSGT",     ""  ],
            ["<<=:=",           "AUGSLE",     ""  ],
            ["<<:=",            "AUGSLT",     ""  ],
            ["~==:=",           "AUGSNE",     ""  ],
            ["\\",              "BACKSLASH",  "b" ],
            ["!",               "BANG",       "b" ],
            ["|",               "BAR",        "b" ],
            ["^",               "CARET",      "b" ],
            ["^:=",             "CARETASGN",  "b" ],
            [":",               "COLON",      ""  ],
            [",",               "COMMA",      ""  ],
            ["||",              "CONCAT",     "b" ],
            ["||:=",            "CONCATASGN", ""  ],
            ["&",               "CONJUNC",    "b" ],
            [".",               "DOT",        "b" ],
            ["--",              "DIFF",       "b" ],
            ["--:=",            "DIFFASGN",   ""  ],
            ["===",             "EQUIV",      "b" ],
            ["**",              "INTER",      "b" ],
            ["**:=",            "INTERASGN",  ""  ],
            ["{",               "LBRACE",     "b" ],
            ["[",               "LBRACK",     "b" ],
            ["|||",             "LCONCAT",    "b" ],
            ["|||:=",           "LCONCATASGN",""  ],
            ["==",              "LEXEQ",      "b" ],
            [">>=",             "LEXGE",      ""  ],
            [">>",              "LEXGT",      ""  ],
            ["<<=",             "LEXLE",      ""  ],
            ["<<",              "LEXLT",      ""  ],
            ["~==",             "LEXNE",      "b" ],
            ["(",               "LPAREN",     "b" ],
            ["-:",              "MCOLON",     ""  ],
            ["-",               "MINUS",      "b" ],
            ["-:=",             "MINUSASGN",  ""  ],
            ["%",               "MOD",        ""  ],
            ["%:=",             "MODASGN",    ""  ],
            ["~===",            "NOTEQUIV",   "b" ],
            ["=",               "NUMEQ",      "b" ],
            [">=",              "NUMGE",      ""  ],
            [">",               "NUMGT",      ""  ],
            ["<=",              "NUMLE",      ""  ],
            ["<",               "NUMLT",      ""  ],
            ["~=",              "NUMNE",      "b" ],
            ["+:",              "PCOLON",     ""  ],
            ["+",               "PLUS",       "b" ],
            ["+:=",             "PLUSASGN",   ""  ],
            ["?",               "QMARK",      "b" ],
            ["<-",              "REVASSIGN",  ""  ],
            ["<->",             "REVSWAP",    ""  ],
            ["}",               "RBRACE",     "e" ],
            ["]",               "RBRACK",     "e" ],
            [")",               "RPAREN",     "e" ],
            [";",               "SEMICOL",    ""  ],
            ["?:=",             "SCANASGN",   ""  ],
            ["/",               "SLASH",      "b" ],
            ["/:=",             "SLASHASGN",  ""  ],
            ["*",               "STAR",       "b" ],
            ["*:=",             "STARASGN",   ""  ],
            [":=:",             "SWAP",       ""  ],
            ["~",               "TILDE",      "b" ],
            ["++",              "UNION",      "b" ],
            ["++:=",            "UNIONASGN",  ""  ],
            ["$(",              "LBRACE",     "b" ],
            ["$)",              "RBRACE",     "e" ],
            ["$<",              "LBRACK",     "b" ],
            ["$>",              "RBRACK",     "e" ],
            ["$",               "RHSARG",     "b" ],
            ["%$(",             "BEGGLOB",    "b" ],
            ["%$)",             "ENDGLOB",    "e" ],
            ["%{",              "BEGGLOB",    "b" ],
            ["%}",              "ENDGLOB",    "e" ],
            ["%%",              "NEWSECT",    "be"]]

        # static be_tbl, reserved_tbl
        reserved_tbl := table()
        every elem := !reserveds do
            insert(reserved_tbl, elem[1], elem[2])
        be_tbl := table()
        every elem := !primitives | !reserveds | !operators do {
            insert(be_tbl, elem[2], elem[3])
        }
    }
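    # Set up the character source if the caller didn't supply one.
    # The create expression below hands back the input a character at
    # a time:  !stream generates the lines of the input file; the
    # 1(...) wrapper bumps line_number as a side effect while passing
    # each line through; the "\n" (stripped by the line-oriented
    # read) is re-appended; and the outer ! generates the resulting
    # string character by character.
    #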
    /getchar := create {
        line_number := 0
        ! ( 1(!stream, line_number +:= 1) || "\n" )
    }
    whitespace := ' \t'
    /next_c := @getchar | {
        if \stream then
            return TOK("EOFX")
        else fail
    }

    repeat {
        case next_c of {

            "." : {
                # Could be a real literal *or* a dot operator.  Check
                # following character to see if it's a digit.  If so,
                # it's a real literal.  We can only get away with
                # doing the dot here because it is not a substring of
                # any longer identifier.  If this gets changed, we'll
                # have to move this code into do_operator().
                #
                last_token := do_dot(getchar)
                suspend last_token
                # write(&errout, "next_c == ", image(next_c))
                next
            }

            "\n" : {
                # If do_newline fails, it means we're at the end of
                # the input stream, and we should break out of the
                # repeat loop.
                #
                every last_token := do_newline(getchar, last_token, be_tbl) do
                    suspend last_token
                if next_c === &null then break
                next
            }

            "\#" : {
                # Just a comment.  Strip it by reading every character
                # up to the next newline.  The global var next_c
                # should *always* == "\n" when this is done.
                #
                do_number_sign(getchar)
                # write(&errout, "next_c == ", image(next_c))
                next
            }

            "\"" : {
                # Suspend as STRINGLIT everything from here up to the
                # next non-backslashed quotation mark, inclusive
                # (accounting for the _ line-continuation convention).
                #
                last_token := do_quotation_mark(getchar)
                suspend last_token
                # write(&errout, "next_c == ", image(next_c))
                next
            }

            "'" : {
                # Suspend as CSETLIT everything from here up to the
                # next non-backslashed apostrophe, inclusive.
                #
                last_token := do_apostrophe(getchar)
                suspend last_token
                # write(&errout, "next_c == ", image(next_c))
                next
            }

            &null : stop("iparse_tokens (lexer):  unexpected EOF")

            default : {
                # If we get to here, we have either whitespace, an
                # integer or real literal, an identifier or reserved
                # word (both get handled by do_identifier), or an
                # operator.  The question of which we have can be
                # determined by checking the first character.
                #
                if any(whitespace, next_c) then {
                    # Like all of the TOK forming procedures,
                    # do_whitespace resets next_c.
                    do_whitespace(getchar, whitespace)
                    # don't suspend any tokens
                    next
                }
                if any(&digits, next_c) then {
                    last_token := do_digits(getchar)
                    suspend last_token
                    next
                }
                if any(&letters ++ '_', next_c) then {
                    last_token := do_identifier(getchar, reserved_tbl)
                    suspend last_token
                    next
                }
                # write(&errout, "it's an operator")
                last_token := do_operator(getchar, operators)
                suspend last_token
                next
            }
        }
    }

    # If stream argument is nonnull, then we are in the top-level
    # iparse_tokens().  If not, then we are in a recursive call, and
    # we should not emit all this end-of-file crap.
    #
    if \stream then {
        return TOK("EOFX")
    }
    else fail

end
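#
# A worked example of the newline handling above:  given the input
#
#     i := i + 1
#     write(i)
#
# the newline after the 1 separates an ender (INTLIT is marked "be"
# in be_tbl) from a beginner (IDENT is marked "be" as well), so the
# tokenizer suspends TOK("SEMICOL", "\n") between the two lines -
# precisely the point at which Icon's own parser would insert a
# semicolon.
#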
#
# do_dot:  coexpression -> TOK record
#          getchar      -> t
#
#     Where getchar is the coexpression that produces the next
#     character from the input stream and t is a token record whose
#     sym field contains either "REALLIT" or "DOT".  Essentially,
#     do_dot checks the next char on the input stream to see if it's
#     a digit.  Since the preceding char was a dot, a digit tips us
#     off that we have a real literal.  Otherwise, it's just a dot
#     operator.  Note that do_dot resets next_c for the next cycle
#     through the main case loop in the calling procedure.
#
procedure do_dot(getchar)

    local token
    # global next_c

    # write(&errout, "it's a dot")

    # If dot's followed by a digit, then we have a real literal.
    #
    if any(&digits, next_c := @getchar) then {
        # write(&errout, "dot -> it's a real literal")
        token := "." || next_c
        while any(&digits, next_c := @getchar) do
            token ||:= next_c
        if token ||:= (next_c == ("e"|"E")) then {
            while (next_c := @getchar) == "0"
            while any(&digits, next_c) do {
                token ||:= next_c
                next_c := @getchar
            }
        }
        return TOK("REALLIT", token)
    }

    # Dot not followed by an integer; so we just have a dot operator,
    # and not a real literal.
    #
    # write(&errout, "dot -> just a plain dot")
    return TOK("DOT", ".")

end

#
# do_newline:  coexpression x TOK record x table -> TOK records
#              (getchar, last_token, be_tbl)     -> Ts (a generator)
#
#     Where getchar is the coexpression that returns the next
#     character from the input stream, last_token is the last TOK
#     record suspended by the calling procedure, be_tbl is a table of
#     tokens and their "beginner/ender" status, and Ts are TOK
#     records.  Note that do_newline resets next_c.  Do_newline is a
#     mess.  What it does is check the last token suspended by the
#     calling procedure to see if it was a beginner or ender.  It
#     then gets the next token by calling iparse_tokens again.  If
#     the next token is a beginner and the last token is an ender,
#     then we have to suspend a SEMICOL token.  In either event, both
#     the last and next token are suspended.
#
procedure do_newline(getchar, last_token, be_tbl)

    local next_token
    # global next_c

    # write(&errout, "it's a newline")

    # Go past any additional newlines.
    #
    while next_c == "\n" do {
        # NL can be the last char in the getchar stream; if it *is*,
        # then signal that it's time to break out of the repeat loop
        # in the calling procedure.
        #
        next_c := @getchar | {
            next_c := &null
            fail
        }
        suspend TOK(&null, next_c == "\n")
    }

    # If there was a last token (i.e. if a newline wasn't the first
    # character of significance in the input stream), then check to
    # see if it was an ender.  If so, then check to see if the next
    # token is a beginner.  If so, then suspend a TOK("SEMICOL")
    # record before suspending the next token.
    #
    if find("e", be_tbl[(\last_token).sym]) then {
        # write(&errout, "calling iparse_tokens via do_newline")
        # &trace := -1
        # First arg to iparse_tokens can be null here.
        \ (next_token := iparse_tokens(&null, getchar)).sym
        if \next_token then {
            # write(&errout, "call of iparse_tokens via do_newline yields ",
            #       ximage(next_token))
            if find("b", be_tbl[next_token.sym]) then
                suspend TOK("SEMICOL", "\n")
            #
            # See below.  If this were like the real Icon parser,
            # the following line would be commented out.
            #
            else suspend TOK(&null, "\n")
            return next_token
        }
        else {
            #
            # If this were a *real* Icon tokenizer, it would not emit
            # any record here, but would simply fail.  Instead, we'll
            # emit a dummy record with a null sym field.
            #
            return TOK(&null, "\n")
            # &trace := 0
            # fail
        }
    }

    # See above.  Again, if this were like Icon's own tokenizer, we
    # would just fail here, and not return any TOK record.
    #
    # &trace := 0
    return TOK(&null, "\n")
    # fail

end

#
# do_number_sign:  coexpression -> &null
#                  getchar      ->
#
#     Where getchar is the coexpression that pops characters off the
#     main input stream.  Sets the global variable next_c.  This
#     procedure simply reads characters until it gets a newline, then
#     returns with next_c == "\n".  Since the starting character was
#     a number sign, this has the effect of stripping comments.
#
procedure do_number_sign(getchar)

    # global next_c

    # write(&errout, "it's a number sign")
    while next_c ~== "\n" do {
        next_c := @getchar
    }

    # Return to calling procedure to cycle around again with the new
    # next_c already set.  Next_c should always be "\n" at this point.
    #
    return

end
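#
# A worked example of the string-literal scanning done below by
# do_quotation_mark():  the two physical lines
#
#     s := "hello, _
#           world"
#
# yield the single token TOK("STRINGLIT", "\"hello, world\"").  The
# trailing underscore, the newline, and the leading whitespace on the
# continuation line are all stripped, per Icon's line-continuation
# convention.
#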
#
# do_quotation_mark:  coexpression -> TOK record
#                     getchar      -> t
#
#     Where getchar is the coexpression that yields another character
#     from the input stream, and t is a TOK record with "STRINGLIT"
#     as its sym field.  Puts everything up to and including the next
#     non-backslashed quotation mark into the str field.  Handles the
#     underscore continuation convention.
#
procedure do_quotation_mark(getchar)

    local token
    # global next_c

    # write(&errout, "it's a string literal")
    token := "\""
    next_c := @getchar
    repeat {
        if next_c == "\n" & token[-1] == "_" then {
            token := token[1:-1]
            while any('\t ', next_c := @getchar)
            next
        } else {
            if slshupto('"', token ||:= next_c, 2) then {
                next_c := @getchar
                # resume outermost (repeat) loop in calling procedure,
                # with the new (here explicitly set) next_c
                return TOK("STRINGLIT", token)
            }
            next_c := @getchar
        }
    }

end

#
# do_apostrophe:  coexpression -> TOK record
#                 getchar      -> t
#
#     Where getchar is the coexpression that yields another character
#     from the input stream, and t is a TOK record with "CSETLIT"
#     as its sym field.  Puts everything up to and including the next
#     non-backslashed apostrophe into the str field.
#
procedure do_apostrophe(getchar)

    local token
    # global next_c

    # write(&errout, "it's a cset literal")
    token := "'"
    next_c := @getchar
    repeat {
        if next_c == "\n" & token[-1] == "_" then {
            token := token[1:-1]
            while any('\t ', next_c := @getchar)
            next
        } else {
            if slshupto("'", token ||:= next_c, 2) then {
                next_c := @getchar
                # Return & resume outermost containing loop in calling
                # procedure w/ new next_c.
                return TOK("CSETLIT", token)
            }
            next_c := @getchar
        }
    }

end

#
# do_digits:  coexpression -> TOK record
#             getchar      -> t
#
#     Where getchar is the coexpression that produces the next char
#     on the input stream, and where t is a TOK record containing
#     either "REALLIT" or "INTLIT" in its sym field, and the text of
#     the numeric literal in its str field.
#
procedure do_digits(getchar)

    local token, tok_record, extras, digits, over
    # global next_c

    # For bases > 16
    extras := "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"
    # Assume integer literal until proven otherwise....
    tok_record := TOK("INTLIT")

    # write(&errout, "it's an integer or real literal")
    token := ("0" ~== next_c) | ""
    while any(&digits, next_c := @getchar) do
        token ||:= next_c
    if token ||:= (next_c == ("R"|"r")) then {
        digits := &digits
        if over := ((10 < token[1:-1]) - 10) * 2 then
            digits ++:= extras[1:over+1] | extras
        next_c := @getchar
        if next_c == "-" then {
            token ||:= next_c
            next_c := @getchar
        }
        while any(digits, next_c) do {
            token ||:= next_c
            next_c := @getchar
        }
    } else {
        if token ||:= (next_c == ".") then {
            while any(&digits, next_c := @getchar) do
                token ||:= next_c
            tok_record := TOK("REALLIT")
        }
        if token ||:= (next_c == ("e"|"E")) then {
            next_c := @getchar
            if next_c == "-" then {
                token ||:= next_c
                next_c := @getchar
            }
            while any(&digits, next_c) do {
                token ||:= next_c
                next_c := @getchar
            }
            tok_record := TOK("REALLIT")
        }
    }
    tok_record.str := ("" ~== token) | "0"
    return tok_record

end
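#
# A worked example of the radix-literal handling in do_digits() above:
# for the input 16rFF, token is "16r" once the R/r test succeeds, so
# token[1:-1] is "16" and over = (16 - 10) * 2 = 12; the legal digit
# set is then extended by extras[1:13], i.e. "AaBbCcDdEeFf".  For
# bases past 36 (where extras[1:over+1] fails), the "| extras"
# alternative simply admits every letter in both cases.
#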
#
# do_whitespace:  coexpression x cset  -> &null
#                 getchar x whitespace -> &null
#
#     Where getchar is the coexpression producing the next char on
#     the input stream.  Do_whitespace just repeats until it finds a
#     non-whitespace character, whitespace being defined as
#     membership of a given character in the whitespace argument (a
#     cset).
#
procedure do_whitespace(getchar, whitespace)

    # write(&errout, "it's junk")
    while any(whitespace, next_c) do
        next_c := @getchar
    return

end

#
# do_identifier:  coexpression x table   -> TOK record
#                 (getchar, reserved_tbl) -> t
#
#     Where getchar is the coexpression that pops off characters from
#     the input stream, reserved_tbl is a table of reserved words
#     (keys = the string values, values = the names qua symbols in
#     the grammar), and t is a TOK record containing all subsequent
#     letters, digits, or underscores after next_c (which must be a
#     letter or underscore).  Note that next_c is global and gets
#     reset by do_identifier.
#
procedure do_identifier(getchar, reserved_tbl)

    local token
    # global next_c

    # write(&errout, "it's an identifier")
    token := next_c
    while any(&letters ++ &digits ++ '_', next_c := @getchar) do
        token ||:= next_c
    return TOK(\reserved_tbl[token], token) | TOK("IDENT", token)

end

#
# do_operator:  coexpression x list  -> TOK record
#               (getchar, operators) -> t
#
#     Where getchar is the coexpression that produces the next
#     character on the input stream, operators is the operator list,
#     and where t is a TOK record describing the operator just
#     scanned.  Calls recognop, which creates a DFSA to recognize
#     valid Icon operators.  Arg2 (operators) is the list of lists
#     containing valid Icon operator string values and names (see
#     above).
#
procedure do_operator(getchar, operators)

    local token, elem

    token := next_c

    # Go until recognop fails.
    while elem := recognop(operators, token, 1) do
        token ||:= (next_c := @getchar)
    # write(&errout, ximage(elem))
    if *\elem = 1 then
        return TOK(elem[1][2], elem[1][1])
    else fail

end

record dfstn_state(b, e, tbl)
record start_state(b, e, tbl, master_list)
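#
# A few concrete cases of the recognop() contract relied on by
# do_operator() above:  recognop(operators, "<", 1) returns a
# one-element list (only "<" itself is exactly one character long);
# recognop(operators, "%$", 1) succeeds but returns an empty list,
# since "%$" is a prefix of "%$(" and "%$)" without being an operator
# itself; and recognop(operators, "%$x", 1) fails outright, which is
# what ends do_operator's scan.
#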
#
# recognop:  list x string x integer -> list
#            (l, s, i)               -> l2
#
#     Where l is the list of lists created by the calling procedure
#     (each element contains a token string value, name, and
#     beginner/ender string), where s is a string possibly
#     corresponding to a token in the list, where i is the position in
#     the elements of l where the operator string values are recorded,
#     and where l2 is a list of elements from l that contain operators
#     for which string s is an exact match.  Fails if there are no
#     operators that s is a prefix of, but returns an empty list if
#     there just aren't any that happen to match exactly.
#
#     What this does is let the calling procedure just keep adding
#     characters to s until recognop fails, then check the last list
#     it returned to see if it is of length 1.  If it is, then it
#     contains a list with the vital stats for the operator last
#     recognized.  If it is of length 0, then string s did not
#     contain any recognizable operator.
#
procedure recognop(l, s, i)

    local current_state, master_list, c, result, j
    static dfstn_table
    initial dfstn_table := table()

    /i := 1
    # See if we've created an automaton for l already.
    /dfstn_table[l] := start_state(1, *l, &null, &null) & {
        dfstn_table[l].master_list := sortf(l, i)
    }
    current_state := dfstn_table[l]
    # Save master_list, as current_state will change later on.
    master_list := current_state.master_list

    s ? {
        while c := move(1) do {

            # Null means that this part of the automaton isn't
            # complete.
            #
            if /current_state.tbl then
                create_arcs(master_list, i, current_state, &pos)

            # If the table has been clobbered, then there are no arcs
            # leading out of the current state.  Fail.
            #
            if current_state.tbl === 0 then
                fail

            # write(&errout, "c = ", image(c))
            # write(&errout, "table for current state = ",
            #       ximage(current_state.tbl))

            # If we get to here, the current state has arcs leading
            # out of it.  See if c is one of them.  If so, make the
            # node to which arc c is connected the current state.
            # Otherwise fail.
            #
            current_state := \current_state.tbl[c] | fail
        }
    }

    # Return possible completions.
    #
    result := list()
    every j := current_state.b to current_state.e do {
        if *master_list[j][i] = *s then
            put(result, master_list[j])
    }
    # return empty list if nothing the right length is found
    return result

end

#
# create_arcs:  fill out a table of arcs leading out of the current
#               state, and place that table in the tbl field for
#               current_state
#
procedure create_arcs(master_list, field, current_state, POS)

    local elem, i, first_char, old_first_char

    current_state.tbl := table()
    old_first_char := ""

    every elem := master_list[i := current_state.b to current_state.e][field]
    do {

        # Get the first character for the current position (note that
        # we're one character behind the calling routine; hence
        # POS-1).
        #
        first_char := elem[POS-1] | next

        # If we have a new first character, create a new arc out of
        # the current state.
        #
        if first_char ~== old_first_char then {
            # Store the start position for the current character.
            current_state.tbl[first_char] := dfstn_state(i)
            # Store the end position for the old character.
            (\current_state.tbl[old_first_char]).e := i-1
            old_first_char := first_char
        }
    }
    (\current_state.tbl[old_first_char]).e := i

    # Clobber table with 0 if no arcs were added.
    current_state.tbl := (*current_state.tbl = 0)
    return current_state

end