Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: retrying a redirect to an already-visited URL loops on AlreadyVisitedError #826

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
12 changes: 9 additions & 3 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,10 @@ var collectorCounter uint32
type key int

// ProxyURLKey is the context key for the request proxy address.
const ProxyURLKey key = iota
// Context keys of type key; iota assigns them consecutive values in
// declaration order, so new keys must be appended at the end.
const (
ProxyURLKey key = iota
// CheckRevisitKey is the context key under which scrape stores the
// request's checkRevisit flag. The redirect check reads it back and only
// returns AlreadyVisitedError for an already visited redirect target when
// the flag is missing or true.
CheckRevisitKey
)

var (
// ErrForbiddenDomain is the error thrown if visiting
Expand Down Expand Up @@ -650,7 +653,8 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c
}
// note: once 1.13 is minimum supported Go version,
// replace this with http.NewRequestWithContext
req = req.WithContext(c.Context)
req = req.WithContext(context.WithValue(c.Context, CheckRevisitKey, checkRevisit))

if err := c.requestCheck(parsedURL, method, req.GetBody, depth, checkRevisit); err != nil {
return err
}
Expand Down Expand Up @@ -1382,7 +1386,9 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
return err
}
if visited {
return &AlreadyVisitedError{req.URL}
if checkRevisit, ok := req.Context().Value(CheckRevisitKey).(bool); !ok || checkRevisit {
return &AlreadyVisitedError{req.URL}
}
}
err = c.store.Visited(uHash)
if err != nil {
Expand Down
22 changes: 22 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1814,3 +1814,25 @@ func TestCollectorPostRetryUnseekable(t *testing.T) {
t.Error("OnResponse Retry was called but BodyUnseekable")
}
}

// TestRedirectErrorRetry visits "/redirected/" directly and then "/redirect".
// The first error reported for a request triggers a single Retry; if the
// retried request reports an AlreadyVisitedError again, the test fails.
func TestRedirectErrorRetry(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()

	collector.OnError(func(resp *Response, err error) {
		// The "notFirst" marker lives in the request context, so it tells a
		// first failure apart from a failure of the retried request.
		firstFailure := resp.Ctx.Get("notFirst") == ""
		if firstFailure {
			resp.Ctx.Put("notFirst", "first")
			// The retry's own result is observed through this same callback.
			_ = resp.Request.Retry()
			return
		}

		var visitedErr *AlreadyVisitedError
		if errors.As(err, &visitedErr) {
			t.Error("loop AlreadyVisitedError")
		}
	})

	// A response callback is registered but intentionally does nothing.
	collector.OnResponse(func(*Response) {})

	base := srv.URL
	collector.Visit(base + "/redirected/")
	collector.Visit(base + "/redirect")
}
Loading