-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhtml.go
91 lines (79 loc) · 1.78 KB
/
html.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
package main
import (
"bufio"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
neturl "net/url"
"strings"
)
// HTMLFetcher downloads a page over HTTP and serves it back from its cached,
// JSON-encoded form.
type HTMLFetcher struct {
	// MaxItemSize, when greater than zero, caps the number of body bytes
	// Generate reads from a response. Zero or a negative value means no cap.
	MaxItemSize int64

	// Client performs the outgoing GET request. Generate calls Client.Get
	// without a nil check, so it has to be set before use.
	Client *http.Client
}
// Generate fetches the page named by the "url" query parameter and returns it
// JSON-encoded as a fetchResponse holding the final request URL and the body.
//
// AJAX crawling is enabled unless the "ajax" parameter is exactly "0". When
// enabled, the URL is passed through escapeFragment before the request, the
// downloaded body is handed to escapeFragmentMeta (if that reports an escaped
// URL, the "url" entry of query is overwritten with it and the fetch is
// retried), and the reported URL is passed through unescapeFragment.
//
// A response status other than 200 yields a StatusCodeError. When
// h.MaxItemSize is positive, at most that many body bytes are read and the
// rest is silently dropped. If the content type sniffed from the start of the
// body does not begin with "text/", Generate returns (nil, nil).
func (h HTMLFetcher) Generate(query neturl.Values) (content []byte, err error) {
	url := query.Get("url")
	// "0" disables AJAX crawling; any other value (or none) leaves it on.
	ajaxCrawling := query.Get("ajax") != "0"
	if ajaxCrawling {
		url = escapeFragment(url)
	}

	resp, err := h.Client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, StatusCodeError{URL: url, Code: resp.StatusCode}
	}

	var r io.Reader = resp.Body
	if h.MaxItemSize > 0 {
		r = io.LimitReader(resp.Body, h.MaxItemSize)
	}
	buffered := bufio.NewReader(r)

	// Sniff the content type without consuming the body. A body shorter than
	// 512 bytes makes Peek return io.EOF together with the bytes it did get,
	// which is fine. Any other error is a genuine read failure and must not
	// be dropped: Peek clears the reader's pending error when it returns it.
	header, peekErr := buffered.Peek(512)
	if peekErr != nil && peekErr != io.EOF {
		return nil, peekErr
	}
	if !strings.HasPrefix(http.DetectContentType(header), "text/") {
		// Non-text content is skipped: no payload and no error.
		return nil, nil
	}

	// The peeked bytes are still buffered, so this reads the whole body.
	buf, err := ioutil.ReadAll(buffered)
	if err != nil {
		return nil, err
	}

	if ajaxCrawling {
		if newurl, escaped := escapeFragmentMeta(url, buf); escaped {
			// Retry against the escaped URL; note this mutates the caller's query.
			query.Set("url", newurl)
			return h.Generate(query)
		}
	}

	// Report the URL of the request that produced this response.
	originalURL := resp.Request.URL.String()
	if ajaxCrawling {
		originalURL = unescapeFragment(originalURL)
	}
	return json.Marshal(fetchResponse{URL: originalURL, Content: buf})
}
// WriteResponse decodes cached as a JSON-encoded fetchResponse, sets the
// X-Real-URL response header to the stored URL and writes the stored content
// to w. It returns the decoding error or the write error, whichever occurs
// first; nothing is written to w when decoding fails.
func (h HTMLFetcher) WriteResponse(w http.ResponseWriter, cached []byte) error {
	var stored fetchResponse
	err := json.Unmarshal(cached, &stored)
	if err != nil {
		return err
	}

	// The header must be set before the first Write call.
	w.Header().Set("X-Real-URL", stored.URL)
	if _, err = w.Write(stored.Content); err != nil {
		return err
	}
	return nil
}
// StatusCodeError records an HTTP response whose status code was not the one
// expected, along with the URL that was requested.
type StatusCodeError struct {
	// URL is the address that was requested.
	URL string
	// Code is the HTTP status code that came back.
	Code int
}

// Error implements the error interface, formatting the status code and URL.
func (e StatusCodeError) Error() string {
	const format = "Response code %d for URL: %s"
	return fmt.Sprintf(format, e.Code, e.URL)
}
// fetchResponse is the payload that Generate marshals to JSON and
// WriteResponse unmarshals again.
type fetchResponse struct {
	// URL is the address reported for the fetched page; WriteResponse sends
	// it back in the X-Real-URL header.
	URL string
	// Content is the fetched body. As a []byte, encoding/json stores it as a
	// base64 string.
	Content []byte
}