From 4eeb1b8ce30136002916584c35cac0dfe5b475fd Mon Sep 17 00:00:00 2001 From: midir99 Date: Sun, 17 Jul 2022 22:40:15 -0500 Subject: [PATCH] Add support to scrape Has Visto A alerts from the GRO prosecutor's office website --- cmd/cmd.go | 14 +- mpp/mpp.go | 110 +++++-- rawpages/gro/hva-alerts-page.html | 531 ++++++++++++++++++++++++++++++ ws/gro.go | 67 +++- ws/mor.go | 12 +- 5 files changed, 689 insertions(+), 45 deletions(-) create mode 100644 rawpages/gro/hva-alerts-page.html diff --git a/cmd/cmd.go b/cmd/cmd.go index ede32b3..793779d 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -18,16 +18,18 @@ import ( type Scraper string const ( - ScraperGroAlba = "gro-alba" - ScraperGroAmber = "gro-amber" - ScraperMorAmber = "mor-amber" - ScraperMorCustom = "mor-custom" + ScraperGroAlba = "gro-alba" + ScraperGroAmber = "gro-amber" + ScraperGroHasVistoA = "gro-hasvistoa" + ScraperMorAmber = "mor-amber" + ScraperMorCustom = "mor-custom" ) func ScrapersAvailable() []Scraper { return []Scraper{ ScraperGroAlba, ScraperGroAmber, + ScraperGroHasVistoA, ScraperMorAmber, ScraperMorCustom, } @@ -129,7 +131,7 @@ func ParseArgs() (*Args, error) { } func PrintVersion() { - fmt.Println("rastreadora v0.0.1") + fmt.Println("rastreadora v0.2.0") } func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document, *http.Client) []mpp.MissingPersonPoster, func(uint64) string, error) { @@ -138,6 +140,8 @@ func SelectScraperFuncs(scraper Scraper) (func(*goquery.Document, *http.Client) return ws.ScrapeGroAlbaAlerts, ws.MakeGroAlbaUrl, nil case ScraperGroAmber: return ws.ScrapeGroAmberAlerts, ws.MakeGroAmberUrl, nil + case ScraperGroHasVistoA: + return ws.ScrapeGroHasVistoAAlerts, ws.MakeGroHasVistoAAlertsUrl, nil case ScraperMorAmber: return ws.ScrapeMorAmberAlerts, ws.MakeMorAmberUrl, nil case ScraperMorCustom: diff --git a/mpp/mpp.go b/mpp/mpp.go index f31b3b3..9e004e4 100644 --- a/mpp/mpp.go +++ b/mpp/mpp.go @@ -1,5 +1,10 @@ package mpp +import ( + "encoding/json" + "time" +) + type State string const ( @@ -73,26 +78,87 @@ const ( ) type MissingPersonPoster struct { - MpName string `json:"mp_name"` - MpHeight uint `json:"mp_height,omitempty"` - MpWeight uint `json:"mp_weight,omitempty"` - MpPhysicalBuild PhysicalBuild `json:"mp_physical_build,omitempty"` - MpComplexion Complexion `json:"mp_complexion,omitempty"` - MpSex Sex `json:"mp_sex,omitempty"` - MpDob string `json:"mp_dob,omitempty"` - MpAgeWhenDisappeared uint `json:"mp_age_when_disappeared,omitempty"` - MpEyesDescription string `json:"mp_eyes_description,omitempty"` - MpHairDescription string `json:"mp_hair_description,omitempty"` - MpOutfitDescription string `json:"mp_outfit_description,omitempty"` - MpIdentifyingCharacteristics string `json:"mp_identifying_characteristics,omitempty"` - CircumstancesBehindDissapearance string `json:"circumstances_behind_dissapearance,omitempty"` - MissingFrom string `json:"missing_from,omitempty"` - MissingDate string `json:"missing_date,omitempty"` - Found bool `json:"found,omitempty"` - AlertType AlertType `json:"alert_type,omitempty"` - PoState State `json:"po_state"` - PoPostUrl string `json:"po_post_url,omitempty"` - PoPostPublicationDate string `json:"po_post_publication_date,omitempty"` - PoPosterUrl string `json:"po_poster_url,omitempty"` - IsMultiple bool `json:"is_multiple,omitempty"` + MpName string + MpHeight uint + MpWeight uint + MpPhysicalBuild PhysicalBuild + MpComplexion Complexion + MpSex Sex + MpDob time.Time + MpAgeWhenDisappeared uint + MpEyesDescription string + MpHairDescription string + MpOutfitDescription string + MpIdentifyingCharacteristics string + CircumstancesBehindDissapearance string + MissingFrom string + MissingDate time.Time + Found bool + AlertType AlertType + PoState State + PoPostUrl string + PoPostPublicationDate time.Time + PoPosterUrl string + IsMultiple bool +} + +func (m MissingPersonPoster) MarshalJSON() ([]byte, error) { + var dob, missingDate, pubDate string + if !m.MpDob.IsZero() { + dob = m.MpDob.Format("2006-01-02") + } + if !m.MissingDate.IsZero() { + missingDate = m.MissingDate.Format("2006-01-02") + } + if !m.PoPostPublicationDate.IsZero() { + pubDate = m.PoPostPublicationDate.Format("2006-01-02") + } + basicMpp := struct { + MpName string `json:"mp_name"` + MpHeight uint `json:"mp_height,omitempty"` + MpWeight uint `json:"mp_weight,omitempty"` + MpPhysicalBuild PhysicalBuild `json:"mp_physical_build,omitempty"` + MpComplexion Complexion `json:"mp_complexion,omitempty"` + MpSex Sex `json:"mp_sex,omitempty"` + MpDob string `json:"mp_dob,omitempty"` + MpAgeWhenDisappeared uint `json:"mp_age_when_disappeared,omitempty"` + MpEyesDescription string `json:"mp_eyes_description,omitempty"` + MpHairDescription string `json:"mp_hair_description,omitempty"` + MpOutfitDescription string `json:"mp_outfit_description,omitempty"` + MpIdentifyingCharacteristics string `json:"mp_identifying_characteristics,omitempty"` + CircumstancesBehindDissapearance string `json:"circumstances_behind_dissapearance,omitempty"` + MissingFrom string `json:"missing_from,omitempty"` + MissingDate string `json:"missing_date,omitempty"` + Found bool `json:"found,omitempty"` + AlertType AlertType `json:"alert_type,omitempty"` + PoState State `json:"po_state"` + PoPostUrl string `json:"po_post_url,omitempty"` + PoPostPublicationDate string `json:"po_post_publication_date,omitempty"` + PoPosterUrl string `json:"po_poster_url,omitempty"` + IsMultiple bool `json:"is_multiple,omitempty"` + }{ + MpName: m.MpName, + MpHeight: m.MpHeight, + MpWeight: m.MpWeight, + MpPhysicalBuild: m.MpPhysicalBuild, + MpComplexion: m.MpComplexion, + MpSex: m.MpSex, + MpDob: dob, + MpAgeWhenDisappeared: m.MpAgeWhenDisappeared, + MpEyesDescription: m.MpEyesDescription, + MpHairDescription: m.MpHairDescription, + MpOutfitDescription: m.MpOutfitDescription, + MpIdentifyingCharacteristics: m.MpIdentifyingCharacteristics, + CircumstancesBehindDissapearance: m.CircumstancesBehindDissapearance, + MissingFrom: m.MissingFrom, + MissingDate: missingDate, + Found: m.Found, + AlertType: m.AlertType, + PoState: m.PoState, + PoPostUrl: m.PoPostUrl, + PoPostPublicationDate: pubDate, + PoPosterUrl: m.PoPosterUrl, + IsMultiple: m.IsMultiple, + } + return json.Marshal(basicMpp) } diff --git a/rawpages/gro/hva-alerts-page.html b/rawpages/gro/hva-alerts-page.html new file mode 100644 index 0000000..46c52cc --- /dev/null +++ b/rawpages/gro/hva-alerts-page.html @@ -0,0 +1,531 @@ + + + + + + + Has Visto A – Fiscalia General del Estado de Guerrero + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ +
+
+ + + + + + + + + + + +
+
+
+
+
+
+
+
+
+

+ HAS VISTO A +

+
+
+ + + + + + +
+
+
+
+
+

JOSE DE JESUS BENITEZ
2022-05-31

JOSE ABUNDIO BOLAÑOS CALVARIO
2022-05-29

JORGE LUIS BASILIO ISIDRO
2022-05-23

JUAN ENRIQUE ORTEGA GANTE
2022-05-23

ELMER ZAHI VAZQUEZ CRISTINO
2022-05-22

SEFERINO CRISTINO VAZQUEZ
2022-05-22

MIGUEL ANGEL GALEANA ARREDONDO
2022-05-21

MIGUEL ANGEL JIMENEZ SAUCEDO
2022-05-18

JUAN CARLOS BAUTISTA PIMENTEL
2022-05-18

CARLOS ALBERTO MOCTEZUMA NAVA
2022-05-17

+
+
+
+
+
+
+
+
+ +
+ +
+ + + + + + + + + + + + + + + + + + diff --git a/ws/gro.go b/ws/gro.go index f1513c5..6239026 100644 --- a/ws/gro.go +++ b/ws/gro.go @@ -10,16 +10,16 @@ import ( "github.com/midir99/rastreadora/mpp" ) -func ParseGroDate(value string) (string, error) { +func ParseGroDate(value string) (time.Time, error) { content := strings.Split(value, "T") if len(content) != 2 { - return "", fmt.Errorf("unable to parse date %s", value) + return time.Time{}, fmt.Errorf("unable to parse date %s", value) } date, err := time.Parse("2006-01-02", content[0]) if err != nil { - return "", fmt.Errorf("unable to parse date %s", value) + return time.Time{}, fmt.Errorf("unable to parse date %s", value) } - return date.Format("2006-01-02"), nil + return date, nil } func ParseGroFound(value string) bool { @@ -78,11 +78,11 @@ func ScrapeGroAlbaAlerts(doc *goquery.Document, client *http.Client) []mpp.Missi if poPostUrl == "" { return } - poPostPublicationDate := s.Find(".entry-date.published").First().AttrOr("datetime", "") - if poPostPublicationDate == "" { - poPostPublicationDate = s.Find(".entry-date").First().AttrOr("datetime", "") + pubDate := s.Find(".entry-date.published").First().AttrOr("datetime", "") + if pubDate == "" { + pubDate = s.Find(".entry-date").First().AttrOr("datetime", "") } - poPostPublicationDate, _ = ParseGroDate(poPostPublicationDate) + poPostPublicationDate, _ := ParseGroDate(pubDate) poPosterUrl := strings.TrimSpace(s.Find("a").AttrOr("data-src", "")) poPosterUrl = strings.Replace(poPosterUrl, "-480x320", "", 1) mpps = append(mpps, mpp.MissingPersonPoster{ @@ -131,11 +131,11 @@ func ScrapeGroAmberAlerts(doc *goquery.Document, client *http.Client) []mpp.Miss if poPostUrl == "" { return } - poPostPublicationDate := s.Find(".entry-date.published").First().AttrOr("datetime", "") - if poPostPublicationDate == "" { - poPostPublicationDate = s.Find(".entry-date").First().AttrOr("datetime", "") + pubDate := s.Find(".entry-date.published").First().AttrOr("datetime", "") + if pubDate == "" { + pubDate = s.Find(".entry-date").First().AttrOr("datetime", "") } - poPostPublicationDate, _ = ParseGroDate(poPostPublicationDate) + poPostPublicationDate, _ := ParseGroDate(pubDate) poPosterUrl := strings.TrimSpace(s.Find("a").AttrOr("data-src", "")) poPosterUrl = strings.Replace(poPosterUrl, "-480x320", "", 1) mpps = append(mpps, mpp.MissingPersonPoster{ @@ -151,3 +151,46 @@ func ScrapeGroAmberAlerts(doc *goquery.Document, client *http.Client) []mpp.Miss }) return mpps } + +func MakeGroHasVistoAAlertsUrl(pageNum uint64) string { + return fmt.Sprintf("https://fiscaliaguerrero.gob.mx/hasvistoa/?pagina=%d", pageNum) +} + +func ScrapeGroHasVistoAAlerts(doc *goquery.Document, client *http.Client) []mpp.MissingPersonPoster { + mpps := []mpp.MissingPersonPoster{} + doc.Find("figure").Each(func(i int, s *goquery.Selection) { + h4Content, err := s.Find("h4").Html() + if err != nil { + return + } + nameAndPubDate := strings.Split(h4Content, "
") + var ( + mpName string + poPostPublicationDate time.Time + ) + if len(nameAndPubDate) == 2 { + mpName = nameAndPubDate[0] + poPostPublicationDate, _ = time.Parse("2006-01-02", nameAndPubDate[1]) + } else { + return + } + poPostUrl := s.Find("a").AttrOr("href", "") + if poPostUrl == "" { + return + } + poPostUrl = "https://fiscaliaguerrero.gob.mx" + poPostUrl + poPosterUrl := s.Find("img").AttrOr("src", "") + if poPosterUrl != "" { + poPosterUrl = "https://fiscaliaguerrero.gob.mx" + poPosterUrl + } + mpps = append(mpps, mpp.MissingPersonPoster{ + AlertType: mpp.AlertTypeHasVistoA, + MpName: mpName, + PoPosterUrl: poPosterUrl, + PoPostPublicationDate: poPostPublicationDate, + PoPostUrl: poPostUrl, + PoState: mpp.StateGuerrero, + }) + }) + return mpps +} diff --git a/ws/mor.go b/ws/mor.go index 1369771..e5716bc 100644 --- a/ws/mor.go +++ b/ws/mor.go @@ -11,10 +11,10 @@ import ( "github.com/midir99/rastreadora/mpp" ) -func ParseMorDate(value string) (string, error) { +func ParseMorDate(value string) (time.Time, error) { date := strings.Split(value, " ") if len(date) != 3 { - return "", fmt.Errorf("unable to parse date %s", value) + return time.Time{}, fmt.Errorf("unable to parse date %s", value) } MONTH_INDEX, DAY_INDEX, YEAR_INDEX := 0, 1, 2 var month time.Month @@ -44,17 +44,17 @@ func ParseMorDate(value string) (string, error) { case "diciembre": month = time.December default: - return "", fmt.Errorf("unable to parse date %s", value) + return time.Time{}, fmt.Errorf("unable to parse date %s", value) } day, err := strconv.Atoi(date[DAY_INDEX]) if err != nil { - return "", fmt.Errorf("unable to parse date %s", value) + return time.Time{}, fmt.Errorf("unable to parse date %s", value) } year, err := strconv.Atoi(date[YEAR_INDEX]) if err != nil { - return "", fmt.Errorf("unable to parse date %s", value) + return time.Time{}, fmt.Errorf("unable to parse date %s", value) } - return time.Date(year, month, day, 0, 0, 0, 0, time.UTC).Format("2006-01-02"), nil + return time.Date(year, month, day, 0, 0, 0, 0, time.UTC), nil } func MakeMorAmberUrl(pageNum uint64) string {