Skip to content

Commit

Permalink
support for HTTP/HTTPS/SOCKS5 proxies (#94)
Browse files Browse the repository at this point in the history
  • Loading branch information
gosom authored Nov 2, 2024
1 parent 49d9d5b commit 1ea5769
Show file tree
Hide file tree
Showing 11 changed files with 35 additions and 64 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
APP_NAME := google_maps_scraper
VERSION := 1.5.2
VERSION := 1.5.3

default: help

Expand Down
13 changes: 3 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@

![Google maps scraper](https://github.com/gosom/google-maps-scraper/blob/main/banner.png)

## 🚀 Please [vote](https://github.com/gosom/google-maps-scraper/discussions/61) for the next features

A command line and web based google maps scraper built using

[scrapemate](https://github.com/gosom/scrapemate) web crawling framework.
Expand All @@ -47,7 +45,6 @@ You can use this repository either as is, or you can use its code as a base and
customize it to your needs



## Try it

### Web UI:
Expand Down Expand Up @@ -91,7 +88,7 @@ Your support helps ensure continued improvement and maintenance.
- Dockerized for easy run in multiple platforms
- Scalable in multiple machines
- Optionally extracts emails from the website of the business
- SOCKS5 proxy support
- SOCKS5/HTTP/HTTPS proxy support

## Notes on email extraction

Expand Down Expand Up @@ -192,7 +189,7 @@ try `./google-maps-scraper -h` to see the command line options available:

```
-c int
sets the concurrency [default: half of CPU cores] (default 8)
sets the concurrency [default: half of CPU cores] (default 11)
-cache string
sets the cache directory [no effect at the moment] (default "cache")
-data-folder string
Expand All @@ -218,11 +215,7 @@ try `./google-maps-scraper -h` to see the command line options available:
-produce
produce seed jobs only (requires dsn)
-proxies string
comma separated list of proxies to use
-proxy-password string
password for proxy authentication
-proxy-username string
username for proxy authentication
comma separated list of proxies to use in the format protocol://user:pass@host:port example: socks5://localhost:9050 or http://user:pass@localhost:9050
-results string
path to the results file [default: stdout] (default "stdout")
-web
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/PuerkitoBio/goquery v1.10.0
github.com/golangci/golangci-lint v1.61.0
github.com/google/uuid v1.6.0
github.com/gosom/scrapemate v0.7.1
github.com/gosom/scrapemate v0.8.0
github.com/jackc/pgx/v5 v5.7.1
github.com/mattn/go-runewidth v0.0.16
github.com/mcnijman/go-emailaddress v1.1.1
Expand Down Expand Up @@ -234,4 +234,4 @@ require (
mvdan.cc/unparam v0.0.0-20240528143540-8a5130ca722f // indirect
)

//replace github.com/gosom/scrapemate v0.6.0 => ../scrapemate
// replace github.com/gosom/scrapemate v0.7.1 => ../scrapemate
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,8 @@ github.com/gordonklaus/ineffassign v0.1.0 h1:y2Gd/9I7MdY1oEIt+n+rowjBNDcLQq3RsH5
github.com/gordonklaus/ineffassign v0.1.0/go.mod h1:Qcp2HIAYhR7mNUVSIxZww3Guk4it82ghYcEXIAk+QT0=
github.com/gosom/kit v0.0.0-20230309082109-543b32ac686a h1:5tcB33GTXm0pFUiEFpmE91tMsHQj+I+W7zubT8J/ugI=
github.com/gosom/kit v0.0.0-20230309082109-543b32ac686a/go.mod h1:ngnWSsuBEpCA5Y43kZRa3x8RBYZZ4LDtvZHO4N5dHZ0=
github.com/gosom/scrapemate v0.7.1 h1:B4ll2lSw4gyzr0BiVcuKu6FQJSPh8Coc1Qg1VxfrtZE=
github.com/gosom/scrapemate v0.7.1/go.mod h1:0EuH67Lz16HlyxQfoSOY46zpLNq/75/qlarYstMPHiQ=
github.com/gosom/scrapemate v0.8.0 h1:rA6wTHmekiWMok79v02L9VYgEQaAXHAa3KdrzSeeAn4=
github.com/gosom/scrapemate v0.8.0/go.mod h1:0EuH67Lz16HlyxQfoSOY46zpLNq/75/qlarYstMPHiQ=
github.com/gostaticanalysis/analysisutil v0.7.1 h1:ZMCjoue3DtDWQ5WyU16YbjbQEQ3VuzwxALrpYd+HeKk=
github.com/gostaticanalysis/analysisutil v0.7.1/go.mod h1:v21E3hY37WKMGSnbsw2S/ojApNWb6C1//mXO48CXbVc=
github.com/gostaticanalysis/comment v1.4.1/go.mod h1:ih6ZxzTHLdadaiSnF5WY3dxUoXfXAlTaRzuaNDlSado=
Expand Down
2 changes: 0 additions & 2 deletions runner/databaserunner/databaserunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ func New(cfg *runner.Config) (runner.Runner, error) {
if len(cfg.Proxies) > 0 {
opts = append(opts,
scrapemateapp.WithProxies(cfg.Proxies),
scrapemateapp.WithProxyUsername(cfg.ProxyUsername),
scrapemateapp.WithProxyPassword(cfg.ProxyPassword),
)
}

Expand Down
2 changes: 0 additions & 2 deletions runner/filerunner/filerunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,6 @@ func (r *fileRunner) setApp() error {
if len(r.cfg.Proxies) > 0 {
opts = append(opts,
scrapemateapp.WithProxies(r.cfg.Proxies),
scrapemateapp.WithProxyUsername(r.cfg.ProxyUsername),
scrapemateapp.WithProxyPassword(r.cfg.ProxyPassword),
)
}

Expand Down
6 changes: 1 addition & 5 deletions runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ type Config struct {
WebRunner bool
DataFolder string
Proxies []string
ProxyUsername string
ProxyPassword string
}

func ParseConfig() *Config {
Expand Down Expand Up @@ -89,9 +87,7 @@ func ParseConfig() *Config {
flag.IntVar(&cfg.Zoom, "zoom", 0, "set zoom level (0-21) for search")
flag.BoolVar(&cfg.WebRunner, "web", false, "run web server instead of crawling")
flag.StringVar(&cfg.DataFolder, "data-folder", "webdata", "data folder for web runner")
flag.StringVar(&proxies, "proxies", "", "comma separated list of proxies to use")
flag.StringVar(&cfg.ProxyUsername, "proxy-username", "", "username for proxy authentication")
flag.StringVar(&cfg.ProxyPassword, "proxy-password", "", "password for proxy authentication")
flag.StringVar(&proxies, "proxies", "", "comma separated list of proxies to use in the format protocol://user:pass@host:port example: socks5://localhost:9050 or http://user:pass@localhost:9050")

flag.Parse()

Expand Down
7 changes: 1 addition & 6 deletions runner/webrunner/webrunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,16 +250,11 @@ func (w *webrunner) setupMate(_ context.Context, writer io.Writer, job *web.Job)
hasProxy := false

if len(w.cfg.Proxies) > 0 {
opts = append(opts, scrapemateapp.WithProxies(w.cfg.Proxies),
scrapemateapp.WithProxyUsername(w.cfg.ProxyUsername),
scrapemateapp.WithProxyPassword(w.cfg.ProxyPassword),
)
opts = append(opts, scrapemateapp.WithProxies(w.cfg.Proxies))
hasProxy = true
} else if len(job.Data.Proxies) > 0 {
opts = append(opts,
scrapemateapp.WithProxies(job.Data.Proxies),
scrapemateapp.WithProxyUsername(job.Data.ProxyUsername),
scrapemateapp.WithProxyPassword(job.Data.ProxyPassword),
)
hasProxy = true
}
Expand Down
20 changes: 9 additions & 11 deletions web/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,15 @@ func (j *Job) Validate() error {
}

type JobData struct {
Keywords []string `json:"keywords"`
Lang string `json:"lang"`
Zoom int `json:"zoom"`
Lat string `json:"lat"`
Lon string `json:"lon"`
Depth int `json:"depth"`
Email bool `json:"email"`
MaxTime time.Duration `json:"max_time"`
Proxies []string `json:"proxies"`
ProxyUsername string `json:"proxy_username"`
ProxyPassword string `json:"proxy_password"`
Keywords []string `json:"keywords"`
Lang string `json:"lang"`
Zoom int `json:"zoom"`
Lat string `json:"lat"`
Lon string `json:"lon"`
Depth int `json:"depth"`
Email bool `json:"email"`
MaxTime time.Duration `json:"max_time"`
Proxies []string `json:"proxies"`
}

func (d *JobData) Validate() error {
Expand Down
14 changes: 6 additions & 8 deletions web/static/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,14 @@ <h1>Google Maps Scraper</h1>
<fieldset>
<div class="form-group">
<label for="proxies">Proxies:(one per line)</label>
<p class="text-muted"><small>Examples:<br>
<p>HTTPS proxy with username/password: https://username:password@example.com:443</p>
<p>HTTP proxy with username/password: http://username:password@example.com:443</p>
<p>SOCKS5 proxy without auth: socks5://127.0.0.1:8000</p>
</p>

<textarea id="proxies" name="proxies" rows="5">{{.ProxiesString}}</textarea>
</div>
<div class="form-group checkbox">
<label for="proxyusername">Username</label>
<input type="text" id="proxyusername" name="proxyusername" value="{{.ProxyUsername}}">
</div>
<div class="form-group">
<label for="proxypassword">Password:</label>
<input type="password" id="proxypassword" name="proxypassword" value="{{.ProxyPassword}}">
</div>
</fieldset>
</details>

Expand Down
25 changes: 10 additions & 15 deletions web/web.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,16 @@ func (s *Server) Start(ctx context.Context) error {
}

type formData struct {
Name string
MaxTime string
Keywords []string
Language string
Zoom int
Lat string
Lon string
Depth int
Email bool
Proxies []string
ProxyUsername string
ProxyPassword string
Name string
MaxTime string
Keywords []string
Language string
Zoom int
Lat string
Lon string
Depth int
Email bool
Proxies []string
}

//nolint:gocritic // this is used in template
Expand Down Expand Up @@ -239,9 +237,6 @@ func (s *Server) scrape(w http.ResponseWriter, r *http.Request) {
}
}

newJob.Data.ProxyUsername = r.Form.Get("proxyusername")
newJob.Data.ProxyPassword = r.Form.Get("proxypassword")

err = newJob.Validate()
if err != nil {
http.Error(w, err.Error(), http.StatusUnprocessableEntity)
Expand Down

0 comments on commit 1ea5769

Please sign in to comment.