############################################################################
#
#   File:     weblinks.icn
#
#   Subject:  Program to check links in HTML files
#
#   Author:   Gregg M. Townsend
#
#   Date:     October 6, 2010
#
############################################################################
#
#   This file is in the public domain.
#
############################################################################
#
#   Weblinks is a program for checking links in a collection of HTML
#   files.  It is designed for use directly on the file structure
#   containing the HTML files.
#
#   Given one or more starting points, weblinks parses each file and
#   validates the HTTP: and FILE: links it finds.  Errors are reported
#   on standard output.  FILE: links, including relative links, can be
#   followed recursively.
#
############################################################################
#
#   By design, only local files are scanned.  Only an existence check is
#   performed for HTTP: links.  Validation of HTTP: links is aided by
#   caching and subject to speed limits; see "vhttp.icn" for details.
#
#   Remote links are checked by sending an HTTP "HEAD" request; an
#   illustrative sketch of such a request appears at the end of this
#   file.  Unfortunately, some sites respond with "Server Error" or even
#   with snide remarks like "Because I felt like it".  These are reported
#   as errors and must be inspected manually.
#
#   NOTE:  if the environment variable USER is set, as it usually is,
#   then "From: $USER@hostname" is sent as part of each remote inquiry
#   in order to identify the source.  This is standard etiquette for
#   automated checkers.  If USER is not set, but LOGNAME is, then
#   $LOGNAME is used.
#
#   Limitations:
#      url(...) links within embedded stylesheets are not recognized.
#      FTP:, MAILTO:, and other link types are not validated.
#      Files are checked recursively only if named *.htm*.
#      Proper file permission (for web export) is not checked.
#
#   The common error of failing to put a trailing slash on a directory
#   specification results in a "453 Is A Directory" error message for a
#   local file or, typically, a "301 Moved Permanently" message for a
#   remote file.
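#
#   For example, a directory reference written without its trailing slash
#   would be reported along these lines (an illustrative transcript with
#   a made-up path, not verbatim output):
#
#      453 Is A Directory:    file:/icon/library
#          referenced by:     file:/icon/index.htm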
#
############################################################################
#
#   usage:  weblinks [options] file...
#
#      -R   follow file links recursively
#           (http links are never followed recursively)
#
#      -t   trace files as visited
#
#      -s   report successes as well as problems
#
#      -v   report tracing and successes, if selected, more verbosely
#
#      -i   invert output (sort by referencing page, not by status)
#
#      -r root
#           specify starting point for file names beginning with "/"
#           (e.g. -r /cs/www).  This is needed if such references are
#           to be followed or checked.  If a root is specified it
#           affects all file specifications including those on the
#           command line.
#
#      -h home
#           specify starting point for file names beginning with "/~".
#
#      -p prefix[,prefix...]
#           prune (don't check) files beginning with a given prefix
#
#      -b prefix
#           specify bounds for files scanned:  do not scan files
#           that do not begin with prefix.  Default bounds are the
#           directory of the last file name.  For example,
#              weblinks /foo/bar /foo/baz
#           implies "-b /foo/".
#
#   If the environment variable WEBLINKS_INIT is set, its whitespace-
#   separated words are prepended to the explicit command argument list.
#
############################################################################
#
#   Examples (all assuming a web area rooted at /cs/www)
#
#   To check one new page:
#      weblinks -r /cs/www /icon/books.htm
#
#   To check a personal hierarchy, with tracing:
#      setenv WEBLINKS_INIT "-r /cs/www -h /cs/www/people"
#      weblinks -R -t /~gmt/
#
#   To check with pruning:
#      weblinks -R -t -r /cs/www -p /icon/library /icon/index.htm
#
############################################################################
#
#   Links:  options, strings, html, vhttp
#
############################################################################
#
#   Requires:  Unix, dynamic loading
#
############################################################################

#  to do:
#     add -u option (report unchecked URLs); -s should imply -u
#     provide way to ask for warnings about (e.g.) /http/html paths
#     provide way to specify translation from http://www... into file:/...
#     provide way to specify translation from ftp:... into file:/...
#     provide depth limit control
#     allow longer history persistence
#     history is clumsy -- hard to recheck a connection that failed
#     add option to retry failed entries (but believe cached successes)

$define URLCOLS 56	# number of columns allotted for tracing URLs
$define STATCOLS 22	# number of columns allotted for status messages

link options
link strings
link html
link vhttp

global root		# file name corresponding to "/"
global home		# file name corresponding to "/~"
global prune		# list of prefixes of files not to be checked
global bounds		# prefix bounding the files to be scanned
global invert		# sort by referencing page instead of status?
global recurse		# follow file links recursively?
global trace		# trace files as visited?
global verbose		# report more verbosely?
global successes	# report successes as well as problems?

global todo, done, nscanned	# work queue, status table, scan count
global refto, reffrom		# tables mapping a url to its referencing
				# and referenced sets of urls

procedure main(args)
   local opts, url, tmp

   # initialize data structures
   prune := list()
   todo := list()
   done := table()
   refto := table()
   reffrom := table()
   nscanned := 0

   # add arguments from the environment to the command line
   tmp := list()
   every put(tmp, words(getenv("WEBLINKS_INIT")))
   while push(args, pull(tmp))

   # process command line
   opts := options(args, "b:p:r:h:iRstv")
   invert := opts["i"]
   recurse := opts["R"]
   successes := opts["s"]
   trace := opts["t"]
   verbose := opts["v"]
   if *args = 0 then
      stop("usage: ", &progname, " [options] file ...")
   setroot(\opts["r"] | "/")
   sethome(\opts["h"] | "/usr/")
   setbounds(\opts["b"] | urlmerge(args[-1], ""))
   every setprune(words(\opts["p"], ' ,'))
   setfrom()

   # check files, starting from those named on the command line
   register("initial:")
   register("implicit:")
   every addref("initial:", urlmerge("file:", !args))
   wheader()
   while url := get(todo) do
      try(url)

   if \trace then
      write()
   report()
end

procedure setroot(s)		# set root directory for "/..." references
   if s[-1] ~== "/" then
      s ||:= "/"
   root := s
   return
end

procedure sethome(s)		# set home directory for "/~..." references
   if s[-1] ~== "/" then
      s ||:= "/"
   home := s
   return
end

procedure setprune(s)		# add a prefix to the prune list
   put(prune, s)
   return
end

procedure setbounds(s)		# set bounds for files scanned
   bounds := s
   return
end

#  setfrom() -- set vhttp_from from $USER (or $LOGNAME) and the hostname

procedure setfrom()
   local user, host, f

   user := getenv("USER" | "LOGNAME") | fail
   *user > 0 | fail
   f := open("uname -n", "rp") | fail	# read hostname from "uname -n"
   host := read(f)
   close(f)
   *\host > 0 | fail
   vhttp_from := user || "@" || host
   return
end

#  wheader() -- write report header

procedure wheader()
   write("From:\t", \vhttp_from | "[none]")
   write("root:\t", root)
   write("home:\t", home)
   write("bounds:\t", bounds)
   every write("start:\t", (!todo)[6:0])	# strip "file:" prefix
   every write("prune:\t", !prune)
   write()
   return
end

#  try(url) -- check one URL, if not already done, and record its status

procedure try(url)
   local result

   (/done[url] := "[processing]") | return	# return if already checked
   if \trace then {
      writes(pad(url, URLCOLS))
      flush(&output)
      }
   result := check(url)
   done[url] := result
   if \trace then
      write(" ", result)
   return
end
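############################################################################
#
#   check() classifies each reference by returning a string styled after
#   an HTTP response code.  The three-digit codes beginning with "1" or
#   "2" ("152 Not Checked", "154 Found index.html", "251 File Exists",
#   "252 File Scanned", and so on) are successes invented by this
#   program; codes beginning with "4" ("451 Illegal URL", "452 Cannot
#   Open", "453 Is A Directory") are failures.  report() keys on the
#   first digit:  any status beginning with a digit in '3456789' is
#   treated as a problem.
#
############################################################################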
#  check(url) -- check one URL and return its status string

procedure check(url)
   local protocol, fspec, fname, f, s, ref, base

   # separate the protocol from the rest of the URL
   url ? {
      protocol := map(tab(upto(':'))) | ""
      =":"
      fspec := tab(0)
      }
   if protocol == "http" then
      return vhttp(url) | "451 Illegal URL"
   if protocol ~== "file" then
      return "152 Not Checked"

   # it's a "file:" link; map the specification to a file name
   fspec ? {
      if ="/~" then
         fname := home || tab(0)
      else if ="/" then
         fname := root || tab(0)
      else if pos(0) then
         fname := "./"
      else
         fname := fspec
      }

   # for a directory reference, look for an index file
   if fname[-1] == "/" then {
      if close(open(fname || "index.html")) then {
         addref("implicit:", url || "index.html")
         return "154 Found index.html"
         }
      if close(open(fname || "index.htm")) then {
         addref("implicit:", url || "index.htm")
         return "155 Found index.htm"
         }
      if close(open(fname || ".")) then
         return "153 Found Directory"
      }

   if not (f := open(fname)) then
      return "452 Cannot Open"

   # decide whether the file's own links should be scanned
   if (/recurse & not member(reffrom["initial:"], url)) |
         (fspec ? (not match(bounds)) | match(!prune)) |
         (not find(".htm", map(url))) then {
      close(f)
      if close(open(fname || "/.")) then
         return "453 Is A Directory"
      else
         return "251 File Exists"
      }

   # scan the file, adding its references to the work list
   base := url
   every s := htrefs(f) do s ? {
      if ="BASE HREF " then
         base := tab(0)
      else {
         tab(upto(' ') + 1)		# skip tag name
         tab(upto(' ') + 1)		# skip attribute name
         ref := urlmerge(base, tab(0))	# resolve relative reference
         addref(url, ref)
         }
      if \verbose then
         writes("\n   references: ", ref)
      }
   if \verbose then
      writes("\n", repl(" ", URLCOLS))
   close(f)
   nscanned +:= 1
   return "252 File Scanned"
end

#  report() -- write the final report

procedure report()
   local l, url, stat, s, t, u

   l := sort(done, 4)			# sort by status string
   t := table()
   while (url := get(l)) & (stat := get(l)) do {
      if \successes | (any('3456789', stat) & stat ~== "302 Found") then {
         s := pad(stat || ":", STATCOLS) || " " || url
         if \invert then
            every u := !refto[url] do
               put(\t[u] | (t[u] := []), s)
         else {
            write(s)
            if \verbose | any('3456789', stat) then
               every write("    referenced by:\t", !sort(refto[url]))
            }
         }
      }
   if \invert then {
      l := sort(t, 3)			# sort by referencing page
      while (url := get(l)) & (stat := get(l)) do {
         write(url)
         every write("   ", !stat)
         }
      }
   write()
   if nscanned = 1 then
      write("1 file scanned")
   else
      write(nscanned, " files scanned")
   if *done = 1 then
      write("1 reference checked")
   else
      write(*done, " references checked")
   return
end

#  addref(src, dst) -- register a reference from src to dst

procedure addref(src, dst)
   dst := (dst ? tab(upto('#') | 0))	# discard any #fragment suffix
   register(dst)
   insert(refto[dst], src)
   insert(reffrom[src], dst)
   if /done[dst] then
      put(todo, dst)
   return
end

#  register(url) -- ensure that url has entries in the reference tables

procedure register(url)
   /refto[url] := set()
   /reffrom[url] := set()
   return
end

#  pad(s, n) -- pad string to length n, never truncating

procedure pad(s, n)
   if *s < n then
      return left(s, n)
   else
      return s
end
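############################################################################
#
#   The procedure below is not called by weblinks; it is only a minimal
#   sketch of the kind of HTTP "HEAD" inquiry described in the header
#   comments.  The real request logic, with caching and speed limits,
#   lives in vhttp.icn.  The sketch assumes an Icon build that supports
#   TCP connections via open(s, "n"); the procedure name, the fixed
#   port 80, and the HTTP/1.0 request format are illustrative choices,
#   not taken from vhttp.icn.
#
############################################################################

procedure head_sketch(host, path)
   local f, line
   f := open(host || ":80", "n") | fail		# TCP connection (assumed "n" mode)
   writes(f, "HEAD ", path, " HTTP/1.0\r\n")	# request line
   if \vhttp_from then				# identify the inquiry source
      writes(f, "From: ", vhttp_from, "\r\n")
   writes(f, "\r\n")				# blank line ends the request
   line := read(f)				# status line, e.g. "HTTP/1.0 200 OK"
   close(f)
   return \line
end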