Skip to content

Commit

Permalink
New flag added to skip the verification of the certificate
Browse files Browse the repository at this point in the history
  • Loading branch information
midir99 committed Jul 16, 2022
1 parent edbc73e commit 2f2241d
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 19 deletions.
13 changes: 9 additions & 4 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"flag"
"fmt"
"log"
"net/http"
"os"
"strconv"
"text/template"
Expand Down Expand Up @@ -51,6 +52,7 @@ Flags:
-o (string): the filename where the data will be stored, if omitted the data will be
dumped in STDOUT.
-scert (bool): skip the verification of the server's certificate chain and hostname.
-V (bool): print the version of the program.
-h (bool): print this usage message.
`
Expand All @@ -71,13 +73,15 @@ type Args struct {
PageFrom uint64
PageUntil uint64
Out string
SkipCert bool
PrintVersion bool
}

func ParseArgs() (*Args, error) {
args := Args{}
flag.StringVar(&args.Out, "o", "", "the filename to dump the missing-person posters data, if not present data is dumped into stdout")
flag.BoolVar(&args.PrintVersion, "V", false, "print the version of the program")
flag.StringVar(&args.Out, "o", "", "the filename where the data will be stored, if omitted the data will be dumped in STDOUT.")
flag.BoolVar(&args.SkipCert, "scert", false, "skip the verification of the server's certificate chain and hostname.")
flag.BoolVar(&args.PrintVersion, "V", false, "print the version of the program.")
flag.Usage = Usage
flag.Parse()
if args.PrintVersion {
Expand Down Expand Up @@ -128,7 +132,7 @@ func PrintVersion() {
fmt.Println("rastreadora v0.0.1")
}

func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document) []mpp.MissingPersonPoster, func(uint64) string, error) {
func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document, *http.Client) []mpp.MissingPersonPoster, func(uint64) string, error) {
switch scraper {
case ScraperGroAlba:
return ws.ScrapeGroAlbaAlerts, ws.MakeGroAlbaUrl, nil
Expand All @@ -153,10 +157,11 @@ func Execute(args *Args) {
log.Fatalf("Error: %s", err)
}
ch := make(chan []mpp.MissingPersonPoster)
client := ws.MakeClient(args.SkipCert)
for pageNum := args.PageFrom; pageNum <= args.PageUntil; pageNum++ {
pageUrl := makeUrl(pageNum)
log.Printf("Processing %s ...\n", pageUrl)
go ws.Scrape(pageUrl, scraper, ch)
go ws.Scrape(pageUrl, client, scraper, ch)
}
mpps := []mpp.MissingPersonPoster{}
pagesCount := args.PageUntil - args.PageFrom + 1
Expand Down
5 changes: 3 additions & 2 deletions ws/gro.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package ws

import (
"fmt"
"net/http"
"strings"
"time"

Expand Down Expand Up @@ -51,7 +52,7 @@ func MakeGroAlbaUrl(pageNum uint64) string {
return fmt.Sprintf("https://fiscaliaguerrero.gob.mx/category/alba/page/%d/", pageNum)
}

func ScrapeGroAlbaAlerts(doc *goquery.Document) []mpp.MissingPersonPoster {
func ScrapeGroAlbaAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster {
mpps := []mpp.MissingPersonPoster{}
doc.Find(".article_content").Each(func(i int, s *goquery.Selection) {
foundAndName := strings.TrimSpace(s.Find("h2 a").Text())
Expand Down Expand Up @@ -102,7 +103,7 @@ func MakeGroAmberUrl(pageNum uint64) string {
return fmt.Sprintf("https://fiscaliaguerrero.gob.mx/category/amber/page/%d/", pageNum)
}

func ScrapeGroAmberAlerts(doc *goquery.Document) []mpp.MissingPersonPoster {
func ScrapeGroAmberAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster {
mpps := []mpp.MissingPersonPoster{}
doc.Find(".article_content").Each(func(i int, s *goquery.Selection) {
foundAndName := strings.TrimSpace(s.Find("h2 a").Text())
Expand Down
11 changes: 6 additions & 5 deletions ws/mor.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package ws

import (
"fmt"
"net/http"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -60,15 +61,15 @@ func MakeMorAmberUrl(pageNum uint64) string {
return fmt.Sprintf("https://fiscaliamorelos.gob.mx/category/alerta-amber/page/%d/", pageNum)
}

func ScrapeMorAmberPoPostUrl(pageUrl string) (string, error) {
doc, err := RetrieveDocument(pageUrl)
func ScrapeMorAmberPoPostUrl(pageUrl string, client *http.Client) (string, error) {
doc, err := RetrieveDocument(pageUrl, client)
if err != nil {
return "", fmt.Errorf("unable to retrieve the page %s", pageUrl)
}
return doc.Find("div .post-thumb-img-content img").AttrOr("src", ""), nil
}

func ScrapeMorAmberAlerts(doc *goquery.Document) []mpp.MissingPersonPoster {
func ScrapeMorAmberAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster {
mpps := []mpp.MissingPersonPoster{}
doc.Find("article").Each(func(i int, s *goquery.Selection) {
mpName := strings.Title(strings.TrimSpace(s.Find("h2").Text()))
Expand All @@ -80,7 +81,7 @@ func ScrapeMorAmberAlerts(doc *goquery.Document) []mpp.MissingPersonPoster {
return
}
poPostPublicationDate, _ := ParseMorDate(strings.TrimSpace(s.Find("span .published").Text()))
poPosterUrl, _ := ScrapeMorAmberPoPostUrl(poPostUrl)
poPosterUrl, _ := ScrapeMorAmberPoPostUrl(poPostUrl, client)
mpps = append(mpps, mpp.MissingPersonPoster{
AlertType: mpp.AlertTypeAmber,
MpName: mpName,
Expand All @@ -97,7 +98,7 @@ func MakeMorCustomUrl(pageNum uint64) string {
return fmt.Sprintf("https://fiscaliamorelos.gob.mx/cedulas/%d/", pageNum)
}

func ScrapeMorCustomAlerts(doc *goquery.Document) []mpp.MissingPersonPoster {
func ScrapeMorCustomAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster {
mpps := []mpp.MissingPersonPoster{}
doc.Find("article").Each(func(i int, s *goquery.Selection) {
mpName := strings.Title(strings.TrimSpace(s.Find("h3 a").Text()))
Expand Down
30 changes: 22 additions & 8 deletions ws/ws.go
Original file line number Diff line number Diff line change
@@ -1,33 +1,47 @@
package ws

import (
"crypto/tls"
"log"
"net/http"

"github.com/PuerkitoBio/goquery"
"github.com/midir99/rastreadora/mpp"
)

func RetrieveDocument(url string) (*goquery.Document, error) {
res, err := http.Get(url)
// MakeClient returns the HTTP client used to download pages.
//
// When skipCert is false it returns http.DefaultClient. When skipCert is true
// it returns a new client whose transport does not verify the server's
// certificate chain or hostname (tls.Config.InsecureSkipVerify). That leaves
// the connection open to man-in-the-middle attacks, so it should only be
// requested explicitly by the user.
func MakeClient(skipCert bool) *http.Client {
	if !skipCert {
		return http.DefaultClient
	}
	// Start from a copy of the default transport so the insecure client keeps
	// its proxy, timeout and connection-pool settings; a bare http.Transport{}
	// would silently drop them (for example, HTTP_PROXY would be ignored).
	tr, ok := http.DefaultTransport.(*http.Transport)
	if ok {
		tr = tr.Clone()
	} else {
		// http.DefaultTransport was replaced by something that is not an
		// *http.Transport; fall back to a fresh transport that still honors
		// the proxy environment variables.
		tr = &http.Transport{Proxy: http.ProxyFromEnvironment}
	}
	if tr.TLSClientConfig == nil {
		tr.TLSClientConfig = &tls.Config{}
	}
	tr.TLSClientConfig.InsecureSkipVerify = true
	return &http.Client{Transport: tr}
}

func RetrieveDocument(url string, client *http.Client) (*goquery.Document, error) {
resp, err := client.Get(url)
if err != nil {
return nil, err
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, err
}
doc, err := goquery.NewDocumentFromReader(res.Body)
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, err
}
return doc, nil
}

func Scrape(pageUrl string, scraper func(*goquery.Document) []mpp.MissingPersonPoster, ch chan []mpp.MissingPersonPoster) {
doc, err := RetrieveDocument(pageUrl)
func Scrape(pageUrl string, client *http.Client, scraper func(*goquery.Document, *http.Client) []mpp.MissingPersonPoster, ch chan []mpp.MissingPersonPoster) {
doc, err := RetrieveDocument(pageUrl, client)
if err != nil {
log.Printf("Error: %s\n", err)
ch <- []mpp.MissingPersonPoster{}
return
}
ch <- scraper(doc)
ch <- scraper(doc, client)
}

0 comments on commit 2f2241d

Please sign in to comment.