-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
84 lines (69 loc) · 2.55 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
package main
import (
	"flag"
	"fmt"
	"os"
	"regexp"
	"sync/atomic"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/extensions"
)
// main crawls every page reachable below the start URL, reports KaTeX render
// errors and failed HTTP requests on stderr, and logs each checked page on
// stdout.
//
// Exit codes: 0 when no errors were found, 1 when KaTeX errors were found,
// 255 when HTTP errors were encountered (or the crawl could not be started).
// When both kinds of error occur, 255 takes precedence.
func main() {
	// parse command line arguments
	const usage = `Usage: check-katex [-u|--start-url ...] [-h|--help]
-u, --start-url Starting point of the crawler (default: https://ems.press/journals)
-h, --help prints help information
Will only check URLs deeper than the given start URL. All errors are printed to
stderr, verbose request information is printed to stdout.
Examples:
go run main.go > /dev/null # only print errors
go run main.go 2> >(tee errors.log) # save all errors to a file
`
	startURL := "https://ems.press/journals"
	flag.StringVar(&startURL, "start-url", startURL, "Start point of the crawler")
	flag.StringVar(&startURL, "u", startURL, "Start point of the crawler")
	flag.Usage = func() { fmt.Print(usage) }
	flag.Parse()

	// code 0: no errors found, code 1: katex errors found, code 255: encountered http errors
	//
	// The collector is asynchronous with a parallelism of 8, so the callbacks
	// below can run on several goroutines at once; the exit code is therefore
	// only accessed through sync/atomic. The process exits explicitly at the
	// end of main instead of from a deferred call, because a deferred os.Exit
	// would also fire during a panic and hide it behind exit status 0.
	var exitCode int32

	collector := colly.NewCollector(
		colly.UserAgent("ems.press check-katex"),
		colly.URLFilters(
			// only look at urls deeper than the given start url: the start URL
			// is matched literally (QuoteMeta) and as a prefix (^), so a URL
			// that merely contains it somewhere else does not pass the filter.
			regexp.MustCompile("^"+regexp.QuoteMeta(startURL)),
		),
		colly.Async(),
	)
	if err := collector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 8}); err != nil {
		fmt.Fprintf(os.Stderr, "Error: invalid limit rule: %s\n", err)
		os.Exit(255)
	}
	extensions.Referer(collector)

	// check all links
	collector.OnHTML("a[href]", func(element *colly.HTMLElement) {
		// Best effort: Visit is expected to reject links that fail the URL
		// filter or were already visited, so its error is deliberately ignored.
		_ = element.Request.Visit(element.Attr("href"))
	})

	// check all rendered formulae for errors
	collector.OnHTML("span.katex", func(element *colly.HTMLElement) {
		katexErrors := element.DOM.Find("span[style*=color]")
		if len(katexErrors.Nodes) == 0 {
			return
		}
		// Only upgrade from "no errors"; never downgrade an HTTP error (255).
		atomic.CompareAndSwapInt32(&exitCode, 0, 1)
		latexSrc := element.DOM.Find("annotation[encoding=\"application/x-tex\"]").Text()
		fmt.Fprintf(os.Stderr, "Error: malformatted latex src $%v$ on URL %s; Parse error at \"%s\"\n", latexSrc, element.Request.URL, katexErrors.First().Text())
	})

	// print some info about visited pages to stdout
	collector.OnResponse(func(response *colly.Response) {
		fmt.Printf("Checked %s\n", response.Request.URL)
	})

	collector.OnError(func(response *colly.Response, err error) {
		switch response.StatusCode {
		case 0, 503, 999:
			// ignore 503 and 999 and 0 status code to avoid flaky errors
			return
		}
		atomic.StoreInt32(&exitCode, 255)
		request := response.Request
		fmt.Fprintf(os.Stderr, "Error: \"%v %s\" while visiting %s; Found on: %s\n", response.StatusCode, err, request.URL, request.Headers.Get("Referer"))
	})

	// A start URL that is rejected before any request is made would otherwise
	// end the run silently with exit status 0 and nothing checked.
	if err := collector.Visit(startURL); err != nil {
		atomic.StoreInt32(&exitCode, 255)
		fmt.Fprintf(os.Stderr, "Error: could not start crawling at %s: %s\n", startURL, err)
	}
	collector.Wait()

	os.Exit(int(atomic.LoadInt32(&exitCode)))
}