diff --git a/cmd/cmd.go b/cmd/cmd.go index bc111eb..ede32b3 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -5,6 +5,7 @@ import ( "flag" "fmt" "log" + "net/http" "os" "strconv" "text/template" @@ -51,6 +52,7 @@ Flags: -o (string): the filename where the data will be stored, if omitted the data will be dumped in STDOUT. + -scert (bool): skip the verification of the server's certificate chain and hostname. -V (bool): print the version of the program. -h (bool): print this usage message. ` @@ -71,13 +73,15 @@ type Args struct { PageFrom uint64 PageUntil uint64 Out string + SkipCert bool PrintVersion bool } func ParseArgs() (*Args, error) { args := Args{} - flag.StringVar(&args.Out, "o", "", "the filename to dump the missing-person posters data, if not present data is dumped into stdout") - flag.BoolVar(&args.PrintVersion, "V", false, "print the version of the program") + flag.StringVar(&args.Out, "o", "", "the filename where the data will be stored, if omitted the data will be dumped in STDOUT.") + flag.BoolVar(&args.SkipCert, "scert", false, "skip the verification of the server's certificate chain and hostname.") + flag.BoolVar(&args.PrintVersion, "V", false, "print the version of the program.") flag.Usage = Usage flag.Parse() if args.PrintVersion { @@ -128,7 +132,7 @@ func PrintVersion() { fmt.Println("rastreadora v0.0.1") } -func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document) []mpp.MissingPersonPoster, func(uint64) string, error) { +func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document, *http.Client) []mpp.MissingPersonPoster, func(uint64) string, error) { switch scraper { case ScraperGroAlba: return ws.ScrapeGroAlbaAlerts, ws.MakeGroAlbaUrl, nil @@ -153,10 +157,11 @@ func Execute(args *Args) { log.Fatalf("Error: %s", err) } ch := make(chan []mpp.MissingPersonPoster) + client := ws.MakeClient(args.SkipCert) for pageNum := args.PageFrom; pageNum <= args.PageUntil; pageNum++ { pageUrl := makeUrl(pageNum) log.Printf("Processing %s ...\n", pageUrl) - go ws.Scrape(pageUrl, scraper, ch) + go ws.Scrape(pageUrl, client, scraper, ch) } mpps := []mpp.MissingPersonPoster{} pagesCount := args.PageUntil - args.PageFrom + 1 diff --git a/ws/gro.go b/ws/gro.go index 975124a..f1513c5 100644 --- a/ws/gro.go +++ b/ws/gro.go @@ -2,6 +2,7 @@ package ws import ( "fmt" + "net/http" "strings" "time" @@ -51,7 +52,7 @@ func MakeGroAlbaUrl(pageNum uint64) string { return fmt.Sprintf("https://fiscaliaguerrero.gob.mx/category/alba/page/%d/", pageNum) } -func ScrapeGroAlbaAlerts(doc *goquery.Document) []mpp.MissingPersonPoster { +func ScrapeGroAlbaAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster { mpps := []mpp.MissingPersonPoster{} doc.Find(".article_content").Each(func(i int, s *goquery.Selection) { foundAndName := strings.TrimSpace(s.Find("h2 a").Text()) @@ -102,7 +103,7 @@ func MakeGroAmberUrl(pageNum uint64) string { return fmt.Sprintf("https://fiscaliaguerrero.gob.mx/category/amber/page/%d/", pageNum) } -func ScrapeGroAmberAlerts(doc *goquery.Document) []mpp.MissingPersonPoster { +func ScrapeGroAmberAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster { mpps := []mpp.MissingPersonPoster{} doc.Find(".article_content").Each(func(i int, s *goquery.Selection) { foundAndName := strings.TrimSpace(s.Find("h2 a").Text()) diff --git a/ws/mor.go b/ws/mor.go index a4d622d..1369771 100644 --- a/ws/mor.go +++ b/ws/mor.go @@ -2,6 +2,7 @@ package ws import ( "fmt" + "net/http" "strconv" "strings" "time" @@ -60,15 +61,15 @@ func MakeMorAmberUrl(pageNum uint64) string { return fmt.Sprintf("https://fiscaliamorelos.gob.mx/category/alerta-amber/page/%d/", pageNum) } -func ScrapeMorAmberPoPostUrl(pageUrl string) (string, error) { - doc, err := RetrieveDocument(pageUrl) +func ScrapeMorAmberPoPostUrl(pageUrl string, client *http.Client) (string, error) { + doc, err := RetrieveDocument(pageUrl, client) if err != nil { return "", fmt.Errorf("unable to retrieve the page %s", pageUrl) } return doc.Find("div .post-thumb-img-content img").AttrOr("src", ""), nil } -func ScrapeMorAmberAlerts(doc *goquery.Document) []mpp.MissingPersonPoster { +func ScrapeMorAmberAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster { mpps := []mpp.MissingPersonPoster{} doc.Find("article").Each(func(i int, s *goquery.Selection) { mpName := strings.Title(strings.TrimSpace(s.Find("h2").Text())) @@ -80,7 +81,7 @@ func ScrapeMorAmberAlerts(doc *goquery.Document) []mpp.MissingPersonPoster { return } poPostPublicationDate, _ := ParseMorDate(strings.TrimSpace(s.Find("span .published").Text())) - poPosterUrl, _ := ScrapeMorAmberPoPostUrl(poPostUrl) + poPosterUrl, _ := ScrapeMorAmberPoPostUrl(poPostUrl, client) mpps = append(mpps, mpp.MissingPersonPoster{ AlertType: mpp.AlertTypeAmber, MpName: mpName, @@ -97,7 +98,7 @@ func MakeMorCustomUrl(pageNum uint64) string { return fmt.Sprintf("https://fiscaliamorelos.gob.mx/cedulas/%d/", pageNum) } -func ScrapeMorCustomAlerts(doc *goquery.Document) []mpp.MissingPersonPoster { +func ScrapeMorCustomAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster { mpps := []mpp.MissingPersonPoster{} doc.Find("article").Each(func(i int, s *goquery.Selection) { mpName := strings.Title(strings.TrimSpace(s.Find("h3 a").Text())) diff --git a/ws/ws.go b/ws/ws.go index 78bd438..5118b4b 100644 --- a/ws/ws.go +++ b/ws/ws.go @@ -1,33 +1,47 @@ package ws import ( + "crypto/tls" + "log" "net/http" "github.com/PuerkitoBio/goquery" "github.com/midir99/rastreadora/mpp" ) -func RetrieveDocument(url string) (*goquery.Document, error) { - res, err := http.Get(url) +func MakeClient(skipCert bool) *http.Client { + if skipCert { + tr := &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + } + return &http.Client{Transport: tr} + } else { + return http.DefaultClient + } +} + +func RetrieveDocument(url string, client *http.Client) (*goquery.Document, error) { + resp, err := client.Get(url) if err != nil { return nil, err } - defer res.Body.Close() - if res.StatusCode != http.StatusOK { + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { return nil, err } - doc, err := goquery.NewDocumentFromReader(res.Body) + doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { return nil, err } return doc, nil } -func Scrape(pageUrl string, scraper func(*goquery.Document) []mpp.MissingPersonPoster, ch chan []mpp.MissingPersonPoster) { - doc, err := RetrieveDocument(pageUrl) +func Scrape(pageUrl string, client *http.Client, scraper func(*goquery.Document, *http.Client) []mpp.MissingPersonPoster, ch chan []mpp.MissingPersonPoster) { + doc, err := RetrieveDocument(pageUrl, client) if err != nil { + log.Printf("Error: %s\n", err) ch <- []mpp.MissingPersonPoster{} return } - ch <- scraper(doc) + ch <- scraper(doc, client) }