Skip to content

Commit

Permalink
Exits the scraper smarter when all jobs are done
Browse files Browse the repository at this point in the history
  • Loading branch information
gosom committed Oct 25, 2024
1 parent f22cdb7 commit e5f2dfb
Show file tree
Hide file tree
Showing 8 changed files with 201 additions and 7 deletions.
106 changes: 106 additions & 0 deletions exiter/exiter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package exiter

import (
"context"
"fmt"
"sync"
"time"
)

type Exiter interface {
SetSeedCount(int)
SetCancelFunc(context.CancelFunc)
IncrSeedCompleted(int)
IncrPlacesFound(int)
IncrPlacesCompleted(int)
Run(context.Context)
}

type exiter struct {
seedCount int
seedCompleted int
placesFound int
placesCompleted int

mu *sync.Mutex
cancelFunc context.CancelFunc
}

func New() Exiter {
return &exiter{
mu: &sync.Mutex{},
}
}

func (e *exiter) SetSeedCount(val int) {
e.mu.Lock()
defer e.mu.Unlock()

e.seedCount = val
}

func (e *exiter) SetCancelFunc(fn context.CancelFunc) {
e.mu.Lock()
defer e.mu.Unlock()

e.cancelFunc = fn
}

func (e *exiter) IncrSeedCompleted(val int) {
e.mu.Lock()
defer e.mu.Unlock()

e.seedCompleted += val
}

func (e *exiter) IncrPlacesFound(val int) {
e.mu.Lock()
defer e.mu.Unlock()

e.placesFound += val
}

func (e *exiter) IncrPlacesCompleted(val int) {
e.mu.Lock()
defer e.mu.Unlock()

e.placesCompleted += val
}

func (e *exiter) Run(ctx context.Context) {
ticker := time.NewTicker(time.Second * 5)
defer ticker.Stop()

for {
select {
case <-ctx.Done():
return
case <-ticker.C:
if e.isDone() {
e.cancelFunc()

return
}
}
}
}

func (e *exiter) isDone() bool {
e.mu.Lock()
defer e.mu.Unlock()

fmt.Println("seedCount:", e.seedCount)
fmt.Println("seedCompleted:", e.seedCompleted)
fmt.Println("placesFound:", e.placesFound)
fmt.Println("placesCompleted:", e.placesCompleted)

if e.seedCompleted != e.seedCount {
return false
}

if e.placesFound != e.placesCompleted {
return false
}

return true
}
24 changes: 22 additions & 2 deletions gmaps/emailjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,21 @@ import (
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/gosom/google-maps-scraper/exiter"
"github.com/gosom/scrapemate"
"github.com/mcnijman/go-emailaddress"
)

type EmailExtractJobOptions func(*EmailExtractJob)

type EmailExtractJob struct {
scrapemate.Job

Entry *Entry
Entry *Entry
ExitMonitor exiter.Exiter
}

func NewEmailJob(parentID string, entry *Entry) *EmailExtractJob {
func NewEmailJob(parentID string, entry *Entry, opts ...EmailExtractJobOptions) *EmailExtractJob {
const (
defaultPrio = scrapemate.PriorityHigh
defaultMaxRetries = 0
Expand All @@ -33,15 +37,31 @@ func NewEmailJob(parentID string, entry *Entry) *EmailExtractJob {

job.Entry = entry

for _, opt := range opts {
opt(&job)
}

return &job
}

func WithEmailJobExitMonitor(exitMonitor exiter.Exiter) EmailExtractJobOptions {
return func(j *EmailExtractJob) {
j.ExitMonitor = exitMonitor
}
}

func (j *EmailExtractJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
defer func() {
resp.Document = nil
resp.Body = nil
}()

defer func() {
if j.ExitMonitor != nil {
j.ExitMonitor.IncrPlacesCompleted(1)
}
}()

log := scrapemate.GetLoggerFromContext(ctx)

log.Info("Processing email job", "url", j.URL)
Expand Down
22 changes: 20 additions & 2 deletions gmaps/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/google/uuid"
"github.com/gosom/google-maps-scraper/deduper"
"github.com/gosom/google-maps-scraper/exiter"
"github.com/gosom/scrapemate"
"github.com/playwright-community/playwright-go"
)
Expand All @@ -23,7 +24,8 @@ type GmapJob struct {
LangCode string
ExtractEmail bool

Deduper deduper.Deduper
Deduper deduper.Deduper
ExitMonitor exiter.Exiter
}

func NewGmapJob(
Expand Down Expand Up @@ -80,6 +82,12 @@ func WithDeduper(d deduper.Deduper) GmapJobOptions {
}
}

func WithExitMonitor(e exiter.Exiter) GmapJobOptions {
return func(j *GmapJob) {
j.ExitMonitor = e
}
}

func (j *GmapJob) UseInResults() bool {
return false
}
Expand All @@ -105,7 +113,12 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any,
} else {
doc.Find(`div[role=feed] div[jsaction]>a`).Each(func(_ int, s *goquery.Selection) {
if href := s.AttrOr("href", ""); href != "" {
nextJob := NewPlaceJob(j.ID, j.LangCode, href, j.ExtractEmail)
jopts := []PlaceJobOptions{}
if j.ExitMonitor != nil {
jopts = append(jopts, WithPlaceJobExitMonitor(j.ExitMonitor))
}

nextJob := NewPlaceJob(j.ID, j.LangCode, href, j.ExtractEmail, jopts...)

if j.Deduper == nil || j.Deduper.AddIfNotExists(ctx, href) {
next = append(next, nextJob)
Expand All @@ -114,6 +127,11 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any,
})
}

if j.ExitMonitor != nil {
j.ExitMonitor.IncrPlacesFound(len(next))
j.ExitMonitor.IncrSeedCompleted(1)
}

log.Info(fmt.Sprintf("%d places found", len(next)))

return nil, next, nil
Expand Down
25 changes: 23 additions & 2 deletions gmaps/place.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,22 @@ import (
"strings"

"github.com/google/uuid"
"github.com/gosom/google-maps-scraper/exiter"
"github.com/gosom/scrapemate"
"github.com/playwright-community/playwright-go"
)

type PlaceJobOptions func(*PlaceJob)

type PlaceJob struct {
scrapemate.Job

UsageInResultststs bool
ExtractEmail bool
ExitMonitor exiter.Exiter
}

func NewPlaceJob(parentID, langCode, u string, extractEmail bool) *PlaceJob {
func NewPlaceJob(parentID, langCode, u string, extractEmail bool, opts ...PlaceJobOptions) *PlaceJob {
const (
defaultPrio = scrapemate.PriorityMedium
defaultMaxRetries = 3
Expand All @@ -39,9 +43,19 @@ func NewPlaceJob(parentID, langCode, u string, extractEmail bool) *PlaceJob {
job.UsageInResultststs = true
job.ExtractEmail = extractEmail

for _, opt := range opts {
opt(&job)
}

return &job
}

func WithPlaceJobExitMonitor(exitMonitor exiter.Exiter) PlaceJobOptions {
return func(j *PlaceJob) {
j.ExitMonitor = exitMonitor
}
}

func (j *PlaceJob) Process(_ context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
defer func() {
resp.Document = nil
Expand All @@ -66,11 +80,18 @@ func (j *PlaceJob) Process(_ context.Context, resp *scrapemate.Response) (any, [
}

if j.ExtractEmail && entry.IsWebsiteValidForEmail() {
emailJob := NewEmailJob(j.ID, &entry)
opts := []EmailExtractJobOptions{}
if j.ExitMonitor != nil {
opts = append(opts, WithEmailJobExitMonitor(j.ExitMonitor))
}

emailJob := NewEmailJob(j.ID, &entry, opts...)

j.UsageInResultststs = false

return nil, []scrapemate.IJob{emailJob}, nil
} else if j.ExitMonitor != nil {
j.ExitMonitor.IncrPlacesCompleted(1)
}

return &entry, nil, err
Expand Down
1 change: 1 addition & 0 deletions runner/databaserunner/databaserunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ func (d *dbrunner) produceSeedJobs(ctx context.Context) error {
d.cfg.GeoCoordinates,
d.cfg.Zoom,
nil,
nil,
)
if err != nil {
return err
Expand Down
12 changes: 12 additions & 0 deletions runner/filerunner/filerunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

"github.com/gosom/google-maps-scraper/deduper"
"github.com/gosom/google-maps-scraper/exiter"
"github.com/gosom/google-maps-scraper/runner"
"github.com/gosom/google-maps-scraper/tlmt"
"github.com/gosom/scrapemate"
Expand Down Expand Up @@ -72,6 +73,7 @@ func (r *fileRunner) Run(ctx context.Context) (err error) {
}()

dedup := deduper.New()
exitMonitor := exiter.New()

seedJobs, err = runner.CreateSeedJobs(
r.cfg.LangCode,
Expand All @@ -81,11 +83,21 @@ func (r *fileRunner) Run(ctx context.Context) (err error) {
r.cfg.GeoCoordinates,
r.cfg.Zoom,
dedup,
exitMonitor,
)
if err != nil {
return err
}

exitMonitor.SetSeedCount(len(seedJobs))

ctx, cancel := context.WithCancel(ctx)
defer cancel()

exitMonitor.SetCancelFunc(cancel)

go exitMonitor.Run(ctx)

err = r.app.Start(ctx, seedJobs...)

return err
Expand Down
7 changes: 7 additions & 0 deletions runner/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"

"github.com/gosom/google-maps-scraper/deduper"
"github.com/gosom/google-maps-scraper/exiter"
"github.com/gosom/google-maps-scraper/gmaps"
"github.com/gosom/scrapemate"
)
Expand All @@ -22,6 +23,7 @@ func CreateSeedJobs(
geoCoordinates string,
zoom int,
dedup deduper.Deduper,
exitMonitor exiter.Exiter,
) (jobs []scrapemate.IJob, err error) {
scanner := bufio.NewScanner(r)

Expand All @@ -39,10 +41,15 @@ func CreateSeedJobs(
}

opts := []gmaps.GmapJobOptions{}

if dedup != nil {
opts = append(opts, gmaps.WithDeduper(dedup))
}

if exitMonitor != nil {
opts = append(opts, gmaps.WithExitMonitor(exitMonitor))
}

job := gmaps.NewGmapJob(id, langCode, query, maxDepth, email, geoCoordinates, zoom, opts...)

jobs = append(jobs, job)
Expand Down
Loading

0 comments on commit e5f2dfb

Please sign in to comment.