Skip to content

Commit

Permalink
Add the option -skip-verify to skip the cert verification, add new Do…
Browse files Browse the repository at this point in the history
…c struct (handy way to explore the HTML document using cascadia), add scraper for CDMX custom alerts
  • Loading branch information
midir99 committed Aug 8, 2022
1 parent da69295 commit 335b692
Show file tree
Hide file tree
Showing 8 changed files with 1,332 additions and 97 deletions.
30 changes: 16 additions & 14 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ import (
"strings"
"text/template"

"github.com/midir99/rastreadora/doc"
"github.com/midir99/rastreadora/mpp"
"github.com/midir99/rastreadora/ws"
"golang.org/x/net/html"
)

type AlertType string

const (
AlertTypeCdmxCustom = "cdmx-custom"
AlertTypeGroAlba = "gro-alba"
AlertTypeGroAmber = "gro-amber"
AlertTypeGroHasVistoA = "gro-hasvistoa"
Expand All @@ -27,6 +28,7 @@ const (

func AlertTypesAvailable() []AlertType {
return []AlertType{
AlertTypeCdmxCustom,
AlertTypeGroAlba,
AlertTypeGroAmber,
AlertTypeGroHasVistoA,
Expand All @@ -52,10 +54,11 @@ Arguments:
Flags:
-o (string): the filename where the data will be stored, if omitted the data will be
dumped in STDOUT.
-V (bool): print the version of the program.
-h (bool): print this usage message.
-o (string): the filename where the data will be stored, if omitted the data will be
dumped in STDOUT.
-skip-verify (bool): skip the verification of the server's certificate chain and hostname.
-V (bool): print the version of the program.
-h (bool): print this usage message.
`

func Usage() {
Expand All @@ -74,14 +77,14 @@ type Args struct {
PageFrom uint64
PageUntil uint64
Out string
SkipCert bool
SkipVerify bool
PrintVersion bool
}

func ParseArgs() (*Args, error) {
args := Args{}
flag.StringVar(&args.Out, "o", "", "the filename where the data will be stored, if omitted the data will be dumped in STDOUT.")
flag.BoolVar(&args.SkipCert, "scert", false, "skip the verification of the server's certificate chain and hostname.")
flag.BoolVar(&args.SkipVerify, "skip-verify", false, "skip the verification of the server's certificate chain and hostname.")
flag.BoolVar(&args.PrintVersion, "V", false, "print the version of the program.")
flag.Usage = Usage
flag.Parse()
Expand Down Expand Up @@ -133,8 +136,10 @@ func PrintVersion() {
fmt.Println("rastreadora v0.4.0")
}

func SelectScraperFuncs(alertType AlertType) (func(*html.Node) ([]mpp.MissingPersonPoster, map[int]error), func(uint64) string, error) {
func SelectScraperFuncs(alertType AlertType) (func(*doc.Doc) ([]mpp.MissingPersonPoster, map[int]error), func(uint64) string, error) {
switch alertType {
case AlertTypeCdmxCustom:
return ws.ScrapeCdmxCustomAlerts, ws.MakeCdmxCustomUrl, nil
case AlertTypeGroAlba:
return ws.ScrapeGroAlbaAlerts, ws.MakeGroAlbaUrl, nil
case AlertTypeGroAmber:
Expand Down Expand Up @@ -164,10 +169,9 @@ func mppLegend(mpps int) string {
return "missing person posters"
}

func Scrape(pageUrl string, scraper func(*html.Node) ([]mpp.MissingPersonPoster, map[int]error), ch chan []mpp.MissingPersonPoster) {
doc, err := ws.RetrieveDocument(pageUrl)
func Scrape(pageUrl string, scraper func(*doc.Doc) ([]mpp.MissingPersonPoster, map[int]error), skipVerify bool, ch chan []mpp.MissingPersonPoster) {
doc, err := ws.RetrieveDocument(pageUrl, skipVerify)
if err != nil {
// log.Printf("Done processing %s; 0 entries collected; %s", pageUrl, err)
log.Printf("0 entries collected from %s; %s", pageUrl, err)
ch <- []mpp.MissingPersonPoster{}
return
Expand All @@ -181,10 +185,8 @@ func Scrape(pageUrl string, scraper func(*html.Node) ([]mpp.MissingPersonPoster,
messages = append(messages, fmt.Sprintf("entry #%d: %s", entryNumber, err))
}
message := strings.Join(messages, ",")
// log.Printf("Done processing %s; %d %s collected; unable to retrieve %d, details: %s", pageUrl, mppsLen, entryWord, errsLen, message)
log.Printf("%d %s collected from %s; unable to retrieve %d, details: %s", mppsLen, entryWord, pageUrl, errsLen, message)
} else {
// log.Printf("Done processing %s; %d %s collected", pageUrl, mppsLen, entryWord)
log.Printf("%d %s collected from %s", mppsLen, entryWord, pageUrl)
}
ch <- mpps
Expand All @@ -202,7 +204,7 @@ func Execute(args *Args) {
ch := make(chan []mpp.MissingPersonPoster)
for pageNum := args.PageFrom; pageNum <= args.PageUntil; pageNum++ {
pageUrl := makeUrl(pageNum)
go Scrape(pageUrl, scraper, ch)
go Scrape(pageUrl, scraper, args.SkipVerify, ch)
}
mpps := []mpp.MissingPersonPoster{}
pagesCount := args.PageUntil - args.PageFrom + 1
Expand Down
71 changes: 71 additions & 0 deletions doc/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package doc

import (
"bytes"

"github.com/andybalholm/cascadia"
"golang.org/x/net/html"
)

// Doc wraps an *html.Node so that the query, traversal and text-extraction
// helper methods declared in this package can be called on it. The node is
// embedded, so its fields remain reachable directly through the Doc value.
type Doc struct {
*html.Node
}

// Query looks up the first node under d that matches the CSS selector query,
// using cascadia.Query.
//
// The returned Doc is never nil and always carries a non-nil Node. When the
// selector cannot be parsed, when d has no underlying node, or when nothing
// matches, a Doc wrapping an empty html.Node is returned, so that calls such
// as Text or AttrOr can be chained on the result without nil checks.
func (d *Doc) Query(query string) *Doc {
	if d == nil || d.Node == nil {
		return &Doc{Node: &html.Node{}}
	}
	sel, err := cascadia.Parse(query)
	if err != nil {
		// An invalid selector is reported the same way as "no match": the
		// method has no error return, and a nil Node would make every
		// chained call panic.
		return &Doc{Node: &html.Node{}}
	}
	if node := cascadia.Query(d.Node, sel); node != nil {
		return &Doc{Node: node}
	}
	return &Doc{Node: &html.Node{}}
}

// QueryAll returns every node under d matched by the CSS selector query
// (as reported by cascadia.QueryAll), each wrapped in its own Doc.
//
// The result is always a non-nil slice. It is empty when the selector cannot
// be parsed, when d has no underlying node, or when nothing matches.
func (d *Doc) QueryAll(query string) []*Doc {
	// A Doc without a node has nothing to search; handing nil to cascadia
	// would dereference it.
	if d == nil || d.Node == nil {
		return []*Doc{}
	}
	sel, err := cascadia.Parse(query)
	if err != nil {
		return []*Doc{}
	}
	nodes := cascadia.QueryAll(d.Node, sel)
	docs := make([]*Doc, 0, len(nodes))
	for _, node := range nodes {
		docs = append(docs, &Doc{Node: node})
	}
	return docs
}

// NthChild returns the child of d at zero-based position n, counting every
// child node in sibling order (FirstChild, then NextSibling, ...).
//
// The returned Doc is never nil and always carries a non-nil Node: when n is
// negative, when d has fewer than n+1 children, or when d has no underlying
// node, a Doc wrapping an empty html.Node is returned. This mirrors what
// Query does for "no match" and keeps chained calls such as
// NthChild(10).AttrOr("href", "") from dereferencing a nil node.
func (d *Doc) NthChild(n int) *Doc {
	if d != nil && d.Node != nil && n >= 0 {
		pos := 0
		for child := d.Node.FirstChild; child != nil; child = child.NextSibling {
			if pos == n {
				return &Doc{Node: child}
			}
			pos++
		}
	}
	return &Doc{Node: &html.Node{}}
}

// Text returns the concatenation of the Data of every text node found in d,
// visiting d's node first and then its descendants depth-first in document
// order. No separators are inserted and no whitespace is trimmed.
//
// It returns the empty string when d has no underlying node.
func (d *Doc) Text() string {
	if d == nil || d.Node == nil {
		return ""
	}
	var buf bytes.Buffer
	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.TextNode {
			buf.WriteString(node.Data)
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(d.Node)
	return buf.String()
}

// AttrOr returns the value of the first attribute of d's node whose key is
// exactly attr. If the node has no such attribute, or d has no underlying
// node, it returns the fallback value or.
func (d *Doc) AttrOr(attr, or string) string {
	if d == nil || d.Node == nil {
		return or
	}
	for _, a := range d.Node.Attr {
		if a.Key == attr {
			return a.Val
		}
	}
	return or
}
12 changes: 6 additions & 6 deletions mpp/mpp.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ const (

type MissingPersonPoster struct {
MpName string
MpHeight uint
MpWeight uint
MpHeight int
MpWeight int
MpPhysicalBuild PhysicalBuild
MpComplexion Complexion
MpSex Sex
MpDob time.Time
MpAgeWhenDisappeared uint
MpAgeWhenDisappeared int
MpEyesDescription string
MpHairDescription string
MpOutfitDescription string
Expand Down Expand Up @@ -122,13 +122,13 @@ func (m MissingPersonPoster) MarshalJSON() ([]byte, error) {
}
basicMpp := struct {
MpName string `json:"mp_name"`
MpHeight uint `json:"mp_height,omitempty"`
MpWeight uint `json:"mp_weight,omitempty"`
MpHeight int `json:"mp_height,omitempty"`
MpWeight int `json:"mp_weight,omitempty"`
MpPhysicalBuild string `json:"mp_physical_build,omitempty"`
MpComplexion string `json:"mp_complexion,omitempty"`
MpSex string `json:"mp_sex,omitempty"`
MpDob string `json:"mp_dob,omitempty"`
MpAgeWhenDisappeared uint `json:"mp_age_when_disappeared,omitempty"`
MpAgeWhenDisappeared int `json:"mp_age_when_disappeared,omitempty"`
MpEyesDescription string `json:"mp_eyes_description,omitempty"`
MpHairDescription string `json:"mp_hair_description,omitempty"`
MpOutfitDescription string `json:"mp_outfit_description,omitempty"`
Expand Down
147 changes: 147 additions & 0 deletions ws/cdmx.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
package ws

import (
"fmt"
"net/url"
"strconv"
"strings"
"time"

"github.com/midir99/rastreadora/doc"
"github.com/midir99/rastreadora/mpp"
"golang.org/x/text/cases"
"golang.org/x/text/language"
)

// ParseCdmxDate parses a date made of five whitespace-separated words in
// which the first word is the day number, the third is a Spanish month name
// and the fifth is the year, for example "8 de agosto de 2022". The second
// and fourth words are not inspected. Matching of the month name is
// case-insensitive.
//
// The result is midnight UTC of that date. Day and year must be unsigned
// decimal numbers that fit in 16 bits, which keeps the conversion to int safe
// on every platform. The day is not checked against the length of the month;
// time.Date normalizes out-of-range days.
//
// An error is returned when the value does not have exactly five words, when
// the day or year is not a valid number, or when the month name is unknown.
func ParseCdmxDate(value string) (time.Time, error) {
	const (
		dayIdx    = 0
		monthIdx  = 2
		yearIdx   = 4
		wantWords = 5
	)
	// Fields (rather than Split on " ") tolerates repeated, leading and
	// trailing whitespace, including non-breaking spaces.
	words := strings.Fields(strings.ToLower(value))
	if len(words) != wantWords {
		return time.Time{}, fmt.Errorf("unable to parse date %s", value)
	}
	day, err := strconv.ParseUint(words[dayIdx], 10, 16)
	if err != nil {
		return time.Time{}, fmt.Errorf("unable to parse date %s (invalid day number: %s)", value, words[dayIdx])
	}
	var month time.Month
	switch words[monthIdx] {
	case "enero":
		month = time.January
	case "febrero":
		month = time.February
	case "marzo":
		month = time.March
	case "abril":
		month = time.April
	case "mayo":
		month = time.May
	case "junio":
		month = time.June
	case "julio":
		month = time.July
	case "agosto":
		month = time.August
	case "septiembre":
		month = time.September
	case "octubre":
		month = time.October
	case "noviembre":
		month = time.November
	case "diciembre":
		month = time.December
	default:
		// Report the word that was read, not the still-unset month variable
		// (which would print as "%!Month(0)").
		return time.Time{}, fmt.Errorf("unable to parse date %s (invalid month: %s)", value, words[monthIdx])
	}
	year, err := strconv.ParseUint(words[yearIdx], 10, 16)
	if err != nil {
		return time.Time{}, fmt.Errorf("unable to parse date %s (invalid year number: %s)", value, words[yearIdx])
	}
	return time.Date(int(year), month, int(day), 0, 0, 0, 0, time.UTC), nil
}

// ParseCdmxFound reports whether the status text says the person has been
// located. Only "localizado" (compared after lower-casing) yields true; every
// other value, including "no localizado" and "ausente", yields false.
func ParseCdmxFound(value string) bool {
	status := strings.ToLower(value)
	return status == "localizado"
}

// ParseCdmxAge extracts the number of years from an age legend whose first
// whitespace-separated word is an unsigned decimal number, for example
// "25 años". Anything after the first word is ignored.
//
// The number must fit in 16 bits, so the conversion to int can never wrap
// into a negative age. An error is returned when the value is empty or blank,
// or when its first word is not a valid number in that range; the returned
// age is 0 in that case.
func ParseCdmxAge(value string) (int, error) {
	// Fields tolerates leading and repeated whitespace, including
	// non-breaking spaces.
	words := strings.Fields(value)
	if len(words) == 0 {
		return 0, fmt.Errorf("unable to parse age %s: empty value", value)
	}
	years, err := strconv.ParseUint(words[0], 10, 16)
	if err != nil {
		return 0, fmt.Errorf("unable to parse age %s: %w", value, err)
	}
	return int(years), nil
}

// MakeCdmxCustomUrl builds the URL of the CDMX listing page pageNum. The page
// number is sent in the "pa" query parameter and "re" is always 100.
func MakeCdmxCustomUrl(pageNum uint64) string {
	const (
		listingPrefix = "https://personasdesaparecidas.fgjcdmx.gob.mx/listado.php?pa="
		listingSuffix = "&re=100"
	)
	page := strconv.FormatUint(pageNum, 10)
	return listingPrefix + page + listingSuffix
}

// cdmxNthChild returns the n-th child of td, or nil when NthChild reports no
// usable node for that position. Callers use it so that a row with fewer
// children than expected degrades to "field missing" instead of
// dereferencing a nil node.
func cdmxNthChild(td *doc.Doc, n int) *doc.Doc {
	child := td.NthChild(n)
	if child == nil || child.Node == nil {
		return nil
	}
	return child
}

// cdmxChildText returns the text of the n-th child of td, or "" when that
// child does not exist.
func cdmxChildText(td *doc.Doc, n int) string {
	if child := cdmxNthChild(td, n); child != nil {
		return child.Text()
	}
	return ""
}

// cdmxLegendValue reads the n-th child of td as a "label:<NBSP>value" legend
// and returns the value part. ok is false unless the text splits into exactly
// two parts around ":\u00A0".
func cdmxLegendValue(td *doc.Doc, n int) (value string, ok bool) {
	parts := strings.Split(cdmxChildText(td, n), ":\u00A0")
	if len(parts) != 2 {
		return "", false
	}
	return parts[1], true
}

// ScrapeCdmxCustomAlerts extracts one missing person poster per "tbody tr"
// row found in d.
//
// Each row must contain exactly two td elements: the first holds the poster
// image and the second holds the data. Within the data cell the fields are
// read by child position: name (0), age legend (2), missing-date legend (4),
// found legend (8) and the link to the post (10).
//
// The name and the post URL are mandatory. A row missing either of them, or
// not having exactly two td elements, is skipped and an error is recorded in
// the returned map under its 1-based row number. The poster URL, missing
// date, found flag and age are optional: when they are absent or cannot be
// parsed they are left at their zero value.
func ScrapeCdmxCustomAlerts(d *doc.Doc) ([]mpp.MissingPersonPoster, map[int]error) {
	const (
		baseUrl = "https://personasdesaparecidas.fgjcdmx.gob.mx/"

		nameChild        = 0
		ageChild         = 2
		missingDateChild = 4
		foundChild       = 8
		postLinkChild    = 10
	)
	rows := d.QueryAll("tbody tr")
	mpps := make([]mpp.MissingPersonPoster, 0, len(rows))
	errs := make(map[int]error)
	// The caser does not depend on the row, so build it once.
	titleCaser := cases.Title(language.LatinAmericanSpanish)
	for i, tr := range rows {
		entry := i + 1
		tds := tr.QueryAll("td")
		if len(tds) != 2 {
			errs[entry] = fmt.Errorf("entry does not have exactly 2 td elements (found %d)", len(tds))
			continue
		}
		posterTd, dataTd := tds[0], tds[1]

		rawName := strings.TrimSpace(cdmxChildText(dataTd, nameChild))
		mpName := titleCaser.String(strings.ReplaceAll(rawName, "\u00A0", " "))
		if mpName == "" {
			errs[entry] = fmt.Errorf("MpName can't be empty")
			continue
		}

		postUrl := ""
		if link := cdmxNthChild(dataTd, postLinkChild); link != nil {
			postUrl = strings.TrimSpace(link.AttrOr("href", ""))
		}
		if postUrl == "" {
			errs[entry] = fmt.Errorf("PoPostUrl can't be empty")
			continue
		}
		poPostUrl, err := url.Parse(baseUrl + postUrl)
		if err != nil {
			errs[entry] = fmt.Errorf("can't parse PoPostUrl: %w", err)
			continue
		}

		var poPosterUrl *url.URL
		if img := posterTd.Query("img"); img != nil && img.Node != nil {
			if posterUrl := strings.TrimSpace(img.AttrOr("src", "")); posterUrl != "" {
				// The poster image is optional: on a parse error url.Parse
				// returns nil and the entry is kept without a poster URL.
				poPosterUrl, _ = url.Parse(baseUrl + posterUrl)
			}
		}

		// The remaining fields are best-effort: a legend that is missing or
		// cannot be parsed leaves the field at its zero value rather than
		// discarding the whole entry.
		var missingDate time.Time
		if legend, ok := cdmxLegendValue(dataTd, missingDateChild); ok {
			missingDate, _ = ParseCdmxDate(legend)
		}
		var found bool
		if legend, ok := cdmxLegendValue(dataTd, foundChild); ok {
			found = ParseCdmxFound(legend)
		}
		var age int
		if legend, ok := cdmxLegendValue(dataTd, ageChild); ok {
			age, _ = ParseCdmxAge(legend)
		}

		mpps = append(mpps, mpp.MissingPersonPoster{
			Found:                found,
			MissingDate:          missingDate,
			MpAgeWhenDisappeared: age,
			MpName:               mpName,
			PoPosterUrl:          poPosterUrl,
			PoPostUrl:            poPostUrl,
			PoState:              mpp.StateCiudadDeMexico,
		})
	}
	return mpps, errs
}
Loading

0 comments on commit 335b692

Please sign in to comment.