Skip to content

Commit

Permalink
Adds a deduplicator to avoid scraping duplicate URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
gosom committed Oct 25, 2024
1 parent aaa999e commit 3b3a32e
Show file tree
Hide file tree
Showing 9 changed files with 346 additions and 130 deletions.
17 changes: 17 additions & 0 deletions deduper/deduper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package deduper

import (
"context"
"sync"
)

// Deduper remembers string keys so that callers can skip work for keys they
// have already handled.
type Deduper interface {
	// AddIfNotExists records key and reports whether it was newly added:
	// true the first time a key is offered, false when it was already known.
	AddIfNotExists(ctx context.Context, key string) bool
}

// New returns a Deduper backed by an empty in-memory set that is guarded by
// a read/write mutex.
func New() Deduper {
	d := new(hashmap)
	d.mux = new(sync.RWMutex)
	d.seen = map[uint64]struct{}{}

	return d
}
42 changes: 42 additions & 0 deletions deduper/hashmap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package deduper

import (
"context"
"hash/fnv"
"sync"
)

// Compile-time assertion that *hashmap implements the Deduper interface.
var _ Deduper = (*hashmap)(nil)

// hashmap is an in-memory Deduper that remembers the 64-bit FNV-1 hash of
// every key it has been given.
//
// Only the hash is stored, not the key itself, so two distinct keys whose
// hashes collide are treated as the same key.
type hashmap struct {
	mux  *sync.RWMutex
	seen map[uint64]struct{}
}

// AddIfNotExists records key and reports whether it was newly added. It
// returns false when the key (more precisely, its hash) was recorded by an
// earlier call. The context argument is ignored.
func (d *hashmap) AddIfNotExists(_ context.Context, key string) bool {
	// Hash exactly once and before taking any lock: hashing allocates and
	// does not touch shared state, so it should not lengthen the critical
	// sections below.
	sum := d.hash(key)

	// Fast path: a shared read lock is enough to reject known keys.
	d.mux.RLock()
	_, known := d.seen[sum]
	d.mux.RUnlock()

	if known {
		return false
	}

	d.mux.Lock()
	defer d.mux.Unlock()

	// Re-check under the write lock: another goroutine may have inserted the
	// same key between releasing the read lock and acquiring the write lock.
	if _, known := d.seen[sum]; known {
		return false
	}

	d.seen[sum] = struct{}{}

	return true
}

// hash returns the 64-bit FNV-1 hash of key.
func (d *hashmap) hash(key string) uint64 {
	h := fnv.New64()
	// hash.Hash documents that Write never returns an error.
	_, _ = h.Write([]byte(key))

	return h.Sum64()
}
29 changes: 27 additions & 2 deletions gmaps/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,31 @@ import (

"github.com/PuerkitoBio/goquery"
"github.com/google/uuid"
"github.com/gosom/google-maps-scraper/deduper"
"github.com/gosom/scrapemate"
"github.com/playwright-community/playwright-go"
)

// GmapJobOptions is a functional option: a function that receives a *GmapJob
// and may modify it.
type GmapJobOptions func(*GmapJob)

// GmapJob is a scrape job that embeds scrapemate.Job and carries the
// settings below.
type GmapJob struct {
scrapemate.Job

// NOTE(review): presumably a limit on how far the results are followed —
// confirm against the code that reads it.
MaxDepth int
// LangCode and ExtractEmail are forwarded to the place jobs created while
// processing this job.
LangCode string
ExtractEmail bool

// Deduper is optional. When it is non-nil, a place link is only queued as a
// follow-up job if Deduper.AddIfNotExists returns true for its href; when it
// is nil every link is queued.
Deduper deduper.Deduper
}

func NewGmapJob(id, langCode, query string, maxDepth int, extractEmail bool, geoCoordinates string, zoom int) *GmapJob {
func NewGmapJob(
id, langCode, query string,
maxDepth int,
extractEmail bool,
geoCoordinates string,
zoom int,
opts ...GmapJobOptions,
) *GmapJob {
query = url.QueryEscape(query)

const (
Expand Down Expand Up @@ -55,9 +67,19 @@ func NewGmapJob(id, langCode, query string, maxDepth int, extractEmail bool, geo
ExtractEmail: extractEmail,
}

for _, opt := range opts {
opt(&job)
}

return &job
}

// WithDeduper returns an option that stores d in the job's Deduper field.
func WithDeduper(d deduper.Deduper) GmapJobOptions {
	attach := func(job *GmapJob) {
		job.Deduper = d
	}

	return attach
}

// UseInResults always reports false for a GmapJob.
// NOTE(review): presumably this tells the scraping framework not to emit this
// job's own output as a result — confirm against scrapemate.
func (j *GmapJob) UseInResults() bool {
	const includeInResults = false

	return includeInResults
}
Expand All @@ -84,7 +106,10 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any,
doc.Find(`div[role=feed] div[jsaction]>a`).Each(func(_ int, s *goquery.Selection) {
if href := s.AttrOr("href", ""); href != "" {
nextJob := NewPlaceJob(j.ID, j.LangCode, href, j.ExtractEmail)
next = append(next, nextJob)

if j.Deduper == nil || j.Deduper.AddIfNotExists(ctx, href) {
next = append(next, nextJob)
}
}
})
}
Expand Down
233 changes: 233 additions & 0 deletions go.work.sum

Large diffs are not rendered by default.

126 changes: 0 additions & 126 deletions results.csv

This file was deleted.

1 change: 1 addition & 0 deletions runner/databaserunner/databaserunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ func (d *dbrunner) produceSeedJobs(ctx context.Context) error {
d.cfg.Email,
d.cfg.GeoCoordinates,
d.cfg.Zoom,
nil,
)
if err != nil {
return err
Expand Down
4 changes: 4 additions & 0 deletions runner/filerunner/filerunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strings"
"time"

"github.com/gosom/google-maps-scraper/deduper"
"github.com/gosom/google-maps-scraper/runner"
"github.com/gosom/google-maps-scraper/tlmt"
"github.com/gosom/scrapemate"
Expand Down Expand Up @@ -70,13 +71,16 @@ func (r *fileRunner) Run(ctx context.Context) (err error) {
_ = runner.Telemetry().Send(ctx, evt)
}()

dedup := deduper.New()

seedJobs, err = runner.CreateSeedJobs(
r.cfg.LangCode,
r.input,
r.cfg.MaxDepth,
r.cfg.Email,
r.cfg.GeoCoordinates,
r.cfg.Zoom,
dedup,
)
if err != nil {
return err
Expand Down
20 changes: 18 additions & 2 deletions runner/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,20 @@ import (
"plugin"
"strings"

"github.com/gosom/google-maps-scraper/deduper"
"github.com/gosom/google-maps-scraper/gmaps"
"github.com/gosom/scrapemate"
)

func CreateSeedJobs(langCode string, r io.Reader, maxDepth int, email bool, geoCoordinates string, zoom int) (jobs []scrapemate.IJob, err error) {
func CreateSeedJobs(
langCode string,
r io.Reader,
maxDepth int,
email bool,
geoCoordinates string,
zoom int,
dedup deduper.Deduper,
) (jobs []scrapemate.IJob, err error) {
scanner := bufio.NewScanner(r)

for scanner.Scan() {
Expand All @@ -29,7 +38,14 @@ func CreateSeedJobs(langCode string, r io.Reader, maxDepth int, email bool, geoC
id = strings.TrimSpace(after)
}

jobs = append(jobs, gmaps.NewGmapJob(id, langCode, query, maxDepth, email, geoCoordinates, zoom))
opts := []gmaps.GmapJobOptions{}
if dedup != nil {
opts = append(opts, gmaps.WithDeduper(dedup))
}

job := gmaps.NewGmapJob(id, langCode, query, maxDepth, email, geoCoordinates, zoom, opts...)

jobs = append(jobs, job)
}

return jobs, scanner.Err()
Expand Down
4 changes: 4 additions & 0 deletions runner/webrunner/webrunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"strings"
"time"

"github.com/gosom/google-maps-scraper/deduper"
"github.com/gosom/google-maps-scraper/runner"
"github.com/gosom/google-maps-scraper/tlmt"
"github.com/gosom/google-maps-scraper/web"
Expand Down Expand Up @@ -172,13 +173,16 @@ func (w *webrunner) scrapeJob(ctx context.Context, job *web.Job) error {
coords = job.Data.Lat + "," + job.Data.Lon
}

dedup := deduper.New()

seedJobs, err := runner.CreateSeedJobs(
job.Data.Lang,
strings.NewReader(strings.Join(job.Data.Keywords, "\n")),
job.Data.Depth,
job.Data.Email,
coords,
job.Data.Zoom,
dedup,
)
if err != nil {
err2 := w.svc.Update(ctx, job)
Expand Down

0 comments on commit 3b3a32e

Please sign in to comment.