minion.go
package scrape

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"
)
// minion crawls the given links, extracts and normalises the urls found on
// each page and returns the dump to gru
type minion struct {
	name      string
	busy      bool                 // busy reports whether the minion is idle or crawling
	mu        *sync.RWMutex        // protects busy
	payloadCh chan *minionPayload  // payloadCh listens for urls to be scraped
	gruDumpCh chan<- *minionDumps  // gruDumpCh sends the finished dumps to gru
}
// newMinion returns a new minion under the given gru
func newMinion(name string, gruDumpCh chan<- *minionDumps) *minion {
	return &minion{
		name:      name,
		mu:        &sync.RWMutex{},
		payloadCh: make(chan *minionPayload),
		gruDumpCh: gruDumpCh,
	}
}
// isBusy reports whether the minion is busy or idle
func isBusy(m *minion) bool {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return m.busy
}
// crawlURL crawls the url and extracts the urls from the page
func crawlURL(depth int, u *url.URL) (md *minionDump) {
	resp, err := http.DefaultClient.Get(u.String())
	if err != nil {
		return &minionDump{
			depth:     depth + 1,
			sourceURL: u,
			err:       err,
		}
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return &minionDump{
			depth:     depth + 1,
			sourceURL: u,
			err:       fmt.Errorf("url responded with code %d", resp.StatusCode),
		}
	}

	ct := resp.Header.Get("Content-Type")
	if ct != "" && !strings.Contains(ct, "text/html") {
		return &minionDump{
			depth:     depth + 1,
			sourceURL: u,
			err:       fmt.Errorf("unknown content type: %s", ct),
		}
	}

	s, iu := extractURLsFromHTML(u, resp.Body)
	return &minionDump{
		depth:       depth + 1,
		sourceURL:   u,
		urls:        s,
		invalidURLs: iu,
	}
}
// crawlURLs crawls the given urls and returns the urls extracted from each page
func crawlURLs(depth int, urls []*url.URL) (mds []*minionDump) {
	for _, u := range urls {
		mds = append(mds, crawlURL(depth, u))
	}
	return mds
}
// startMinion starts the minion and blocks until the context is cancelled
func startMinion(ctx context.Context, m *minion) {
	log.Printf("Starting %s...\n", m.name)
	for {
		select {
		case <-ctx.Done():
			return
		case mp := <-m.payloadCh:
			// busy is read concurrently via isBusy, so guard the writes with mu
			m.mu.Lock()
			m.busy = true
			m.mu.Unlock()

			log.Printf("Crawling urls(%d) from depth %d\n", len(mp.urls), mp.currentDepth)
			mds := crawlURLs(mp.currentDepth, mp.urls)

			// send the dumps to gru and wait for the ack before going idle
			got := make(chan bool)
			m.gruDumpCh <- &minionDumps{
				minion: m.name,
				got:    got,
				mds:    mds,
			}
			<-got

			m.mu.Lock()
			m.busy = false
			m.mu.Unlock()
		}
	}
}
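
// exampleGruLoop is a hypothetical sketch, not part of the original file. It
// illustrates how a gru-side caller could drive a minion: start it, hand it a
// payload, and ack the dumps it sends back. The minionPayload, minionDump and
// minionDumps types are defined elsewhere in the scrape package; only the
// fields already referenced in this file are assumed here, and the seed url
// and minion name are placeholders.
func exampleGruLoop(ctx context.Context) {
	gruDumpCh := make(chan *minionDumps)
	m := newMinion("minion-1", gruDumpCh)
	go startMinion(ctx, m)

	// hand the minion a single seed url at depth 0
	seed, err := url.Parse("https://example.com")
	if err != nil {
		log.Fatal(err)
	}
	m.payloadCh <- &minionPayload{currentDepth: 0, urls: []*url.URL{seed}}

	// collect the dumps and ack so the minion can mark itself idle again
	d := <-gruDumpCh
	log.Printf("%s returned %d dumps\n", d.minion, len(d.mds))
	d.got <- true
}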