Commit

add request statistics
lizongying committed Nov 21, 2023
1 parent 8239bd2 commit 3710006
Showing 45 changed files with 1,022 additions and 289 deletions.
4 changes: 0 additions & 4 deletions README_CN.md
@@ -1109,10 +1109,6 @@ curl https://github.com/lizongying/go-crawler -x http://localhost:8082 --cacert
* statistics
* panic stop
* extra rate limiting
* ctx exits before all requests finish
* request with ctx
* stat->crawler
* test refer cookie
* request context
* item context
* total
11 changes: 1 addition & 10 deletions pkg/api/records.go
@@ -18,17 +18,8 @@ func (h *RouteRecords) Pattern() string {
return UrlRecords
}

func (h *RouteRecords) ServeHTTP(w http.ResponseWriter, r *http.Request) {
func (h *RouteRecords) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
records := h.crawler.GetStatistics().GetRecords()
//for _, v := range nodes {
// fmt.Println(v)
// bs, err := v.Marshal()
// if err != nil {
// h.OutJson(w, 1, err.Error(), nil)
// return
// }
//
//}
h.OutJson(w, 0, "", records)
}

34 changes: 34 additions & 0 deletions pkg/api/requests.go
@@ -0,0 +1,34 @@
package api

import (
"github.com/lizongying/go-crawler/pkg"
"net/http"
)

const UrlRequests = "/requests"

type RouteRequests struct {
Request
Response
crawler pkg.Crawler
logger pkg.Logger
}

func (h *RouteRequests) Pattern() string {
return UrlRequests
}

func (h *RouteRequests) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
requests := h.crawler.GetStatistics().GetRequests()
h.OutJson(w, 0, "", requests)
}

func (h *RouteRequests) FromCrawler(crawler pkg.Crawler) pkg.Route {
if h == nil {
return new(RouteRequests).FromCrawler(crawler)
}

h.logger = crawler.GetLogger()
h.crawler = crawler
return h
}
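
A minimal sketch of querying the new route once the crawler's HTTP API is running. The address and port are assumptions (check the project's config for the real listen address); the handler wraps the request statistics with OutJson, so the body is expected to be a JSON envelope.

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumed address: point this at whatever host/port the crawler's HTTP API listens on.
	resp, err := http.Get("http://localhost:8090/requests")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	// Prints the JSON envelope containing the request statistics.
	fmt.Println(string(body))
}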
5 changes: 5 additions & 0 deletions pkg/context.go
@@ -119,12 +119,15 @@ type ContextRequest interface {
WithStartTime(time.Time) ContextRequest
GetStopTime() time.Time
WithStopTime(time.Time) ContextRequest
GetUpdateTime() time.Time
GetDeadline() time.Time
WithDeadline(time.Time) ContextRequest
GetCookies() map[string]string
WithCookies(map[string]string) ContextRequest
GetReferrer() string
WithReferrer(string) ContextRequest
GetStopReason() string
WithStopReason(stopReason string) ContextRequest
}

type ContextItem interface {
@@ -139,4 +142,6 @@ type ContextItem interface {
GetUpdateTime() time.Time
GetSaved() bool
WithSaved(bool) ContextItem
GetStopReason() string
WithStopReason(stopReason string) ContextItem
}
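
A rough sketch of how the new stop-reason setter might be used alongside the existing stop-time setter. The helper itself is hypothetical; only the two ContextRequest methods come from the interface above.

package example

import (
	"time"

	"github.com/lizongying/go-crawler/pkg"
)

// markRequestStopped is a hypothetical helper, not part of the codebase:
// it records why and when a request stopped using the accessors defined above.
func markRequestStopped(req pkg.ContextRequest, cause error) {
	req.WithStopReason(cause.Error()).WithStopTime(time.Now())
}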
1 change: 1 addition & 0 deletions pkg/context/crawler.go
@@ -14,6 +14,7 @@ type Crawler struct {
StartTime utils.Timestamp `json:"start_time,omitempty"`
StopTime utils.Timestamp `json:"stop_time,omitempty"`
UpdateTime utils.Timestamp `json:"update_time,omitempty"`
StopReason string `json:"stop_reason,omitempty"`
}

func (c *Crawler) GetId() string {
12 changes: 10 additions & 2 deletions pkg/context/item.go
@@ -18,6 +18,7 @@ type Item struct {
Cookies map[string]string `json:"cookies,omitempty"`
Referrer string `json:"referrer,omitempty"`
Saved bool `json:"saved,omitempty"`
StopReason string `json:"stop_reason,omitempty"`
}

func (c *Item) GetId() string {
@@ -42,11 +43,11 @@ func (c *Item) WithStatus(status pkg.ItemStatus) pkg.ContextItem {
t := time.Now()
c.withUpdateTime(t)
switch status {
case pkg.ItemStatusPending:
case pkg.ItemStatusRunning:
c.withStartTime(t)
case pkg.ItemStatusSuccess:
c.withStopTime(t)
case pkg.ItemStatusError:
case pkg.ItemStatusFailure:
c.withStopTime(t)
}

@@ -80,3 +81,10 @@ func (c *Item) WithSaved(saved bool) pkg.ContextItem {
c.Saved = saved
return c
}
func (c *Item) GetStopReason() string {
return c.StopReason
}
func (c *Item) WithStopReason(stopReason string) pkg.ContextItem {
c.StopReason = stopReason
return c
}
43 changes: 35 additions & 8 deletions pkg/context/request.go
@@ -8,14 +8,16 @@ import (
)

type Request struct {
Context context.Context `json:"-"`
Id string `json:"id,omitempty"`
Status pkg.RequestStatus `json:"status,omitempty"`
StartTime utils.Timestamp `json:"start_time,omitempty"`
StopTime utils.Timestamp `json:"stop_time,omitempty"`
Deadline utils.TimestampNano `json:"deadline,omitempty"`
Cookies map[string]string `json:"cookies,omitempty"`
Referrer string `json:"referrer,omitempty"`
Context context.Context `json:"-"`
Id string `json:"id,omitempty"`
Status pkg.RequestStatus `json:"status,omitempty"`
StartTime utils.Timestamp `json:"start_time,omitempty"`
StopTime utils.Timestamp `json:"stop_time,omitempty"`
UpdateTime utils.Timestamp `json:"update_time,omitempty"`
Deadline utils.TimestampNano `json:"deadline,omitempty"`
Cookies map[string]string `json:"cookies,omitempty"`
Referrer string `json:"referrer,omitempty"`
StopReason string `json:"stop_reason,omitempty"`
}

func (c *Request) GetId() string {
@@ -37,6 +39,17 @@ func (c *Request) GetStatus() pkg.RequestStatus {
}
func (c *Request) WithStatus(status pkg.RequestStatus) pkg.ContextRequest {
c.Status = status
t := time.Now()
c.withUpdateTime(t)
switch status {
case pkg.RequestStatusRunning:
c.WithStartTime(t)
case pkg.RequestStatusSuccess:
c.WithStopTime(t)
case pkg.RequestStatusFailure:
c.WithStopTime(t)
}

return c
}
func (c *Request) GetStartTime() time.Time {
@@ -53,6 +66,13 @@ func (c *Request) WithStopTime(stopTime time.Time) pkg.ContextRequest {
c.StopTime = utils.Timestamp{Time: stopTime}
return c
}
func (c *Request) GetUpdateTime() time.Time {
return c.UpdateTime.Time
}
func (c *Request) withUpdateTime(t time.Time) pkg.ContextRequest {
c.UpdateTime = utils.Timestamp{Time: t}
return c
}
func (c *Request) GetDeadline() time.Time {
return c.Deadline.Time
}
@@ -74,3 +94,10 @@ func (c *Request) WithReferrer(referrer string) pkg.ContextRequest {
c.Referrer = referrer
return c
}
func (c *Request) GetStopReason() string {
return c.StopReason
}
func (c *Request) WithStopReason(stopReason string) pkg.ContextRequest {
c.StopReason = stopReason
return c
}
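
With WithStatus now stamping UpdateTime plus the start or stop time, a request context's lifecycle can be traced roughly as below. Only the methods and constants come from the diff above; the surrounding program is an assumption for illustration.

package main

import (
	"fmt"

	"github.com/lizongying/go-crawler/pkg"
	crawlerContext "github.com/lizongying/go-crawler/pkg/context"
)

func main() {
	req := new(crawlerContext.Request)
	req.WithStatus(pkg.RequestStatusRunning) // stamps StartTime and UpdateTime
	// ... the request would be downloaded here ...
	req.WithStatus(pkg.RequestStatusSuccess) // stamps StopTime and UpdateTime

	fmt.Println(req.GetStartTime(), req.GetStopTime(), req.GetUpdateTime())
}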
1 change: 1 addition & 0 deletions pkg/context/spider.go
@@ -16,6 +16,7 @@ type Spider struct {
StartTime utils.Timestamp `json:"start_time,omitempty"`
StopTime utils.Timestamp `json:"stop_time,omitempty"`
UpdateTime utils.Timestamp `json:"update_time,omitempty"`
StopReason string `json:"stop_reason,omitempty"`
}

func (c *Spider) GetSpider() pkg.Spider {
2 changes: 1 addition & 1 deletion pkg/context/task.go
@@ -68,7 +68,7 @@ func (c *Task) WithStatus(status pkg.TaskStatus) pkg.ContextTask {
c.WithStartTime(t)
case pkg.TaskStatusSuccess:
c.WithStopTime(t)
case pkg.TaskStatusError:
case pkg.TaskStatusFailure:
c.WithStopTime(t)
}

1 change: 1 addition & 0 deletions pkg/crawler/crawler.go
@@ -367,6 +367,7 @@ func NewCrawler(spiders []pkg.Spider, cli *cli.Cli, config *config.Config, logge
httpApi.AddRoutes(new(api.RouteSpiders).FromCrawler(crawler))
httpApi.AddRoutes(new(api.RouteJobs).FromCrawler(crawler))
httpApi.AddRoutes(new(api.RouteTasks).FromCrawler(crawler))
httpApi.AddRoutes(new(api.RouteRequests).FromCrawler(crawler))
httpApi.AddRoutes(new(api.RouteRecords).FromCrawler(crawler))
httpApi.AddRoutes(new(api.RouteUser).FromCrawler(crawler))

2 changes: 1 addition & 1 deletion pkg/downloader/downloader.go
@@ -132,10 +132,10 @@ func (d *Downloader) FromSpider(spider pkg.Spider) pkg.Downloader {
}

d.spider = spider
d.logger = spider.GetLogger()
d.httpClient = new(http_client.HttpClient).FromSpider(spider)
d.browserManager = new(browser.Manager).FromSpider(spider)
d.middlewares = new(middlewares.Middlewares).FromSpider(spider)
d.logger = spider.GetLogger()

spider.GetCrawler().GetSignal().RegisterSpiderChanged(d.spiderClosed)
return d
10 changes: 7 additions & 3 deletions pkg/exporter/exporter.go
@@ -54,9 +54,13 @@ func (e *Exporter) SetPipeline(pipeline pkg.Pipeline, order uint8) {
pipeline.SetName(name)
pipeline.SetOrder(order)
for k, v := range e.pipelines {
if v.Name() == name && v.Order() != order {
e.DelPipeline(k)
break
if v.Name() == name {
if v.Order() == order {
return
} else {
e.DelPipeline(k)
break
}
}
}

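The reworked loop changes the registration semantics slightly: re-adding a pipeline that already exists at the same order is now a no-op instead of a delete-and-re-add. A hypothetical illustration follows; the helper, the exporter value and the pipeline value are assumptions, while SetPipeline's signature comes from the hunk above.

package example

import (
	"github.com/lizongying/go-crawler/pkg"
	"github.com/lizongying/go-crawler/pkg/exporter"
)

// reRegister is illustrative only, not part of the codebase.
func reRegister(e *exporter.Exporter, p pkg.Pipeline) {
	e.SetPipeline(p, 10) // first registration: stored at order 10
	e.SetPipeline(p, 10) // same name, same order: early return, nothing changes
	e.SetPipeline(p, 20) // same name, different order: old entry deleted, re-added at order 20
}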
8 changes: 4 additions & 4 deletions pkg/item.go
@@ -185,21 +185,21 @@ type ItemStatus uint8
const (
ItemStatusUnknown = iota
ItemStatusPending
ItemStatusDoing
ItemStatusRunning
ItemStatusSuccess
ItemStatusError
ItemStatusFailure
)

func (s *ItemStatus) String() string {
switch *s {
case 1:
return "pending"
case 2:
return "doing"
return "running"
case 3:
return "success"
case 4:
return "error"
return "failure"
default:
return "unknown"
}
5 changes: 0 additions & 5 deletions pkg/items/items.go
@@ -5,11 +5,6 @@ import (
"github.com/lizongying/go-crawler/pkg"
)

type ItemWithContext struct {
pkg.Context
pkg.Item
}

type ItemNone struct {
pkg.ItemUnimplemented
}
2 changes: 1 addition & 1 deletion pkg/middlewares/middlewares.go
@@ -38,7 +38,7 @@ func (m *Middlewares) SetMiddleware(middleware pkg.Middleware, order uint8) {
middleware.SetName(name)
middleware.SetOrder(order)
for k, v := range m.middlewares {
if v.Name() == name && v.Order() != order {
if v.Name() == name {
m.DelMiddleware(k)
break
}
10 changes: 10 additions & 0 deletions pkg/middlewares/stats.go
@@ -3,6 +3,7 @@ package middlewares
import (
"fmt"
"github.com/lizongying/go-crawler/pkg"
"github.com/lizongying/go-crawler/pkg/utils"
"net/http"
"sort"
)
@@ -14,6 +15,15 @@ type StatsMiddleware struct {

func (m *StatsMiddleware) taskStopped(c pkg.Context) (err error) {
task := c.GetTask()
if c.GetSpider().GetId() != m.GetSpider().GetContext().GetSpider().GetId() {
return
}
if !utils.InSlice(task.GetStatus(), []pkg.TaskStatus{
pkg.TaskStatusSuccess,
pkg.TaskStatusFailure,
}) {
return
}

var sl []any
sl = append(sl, c.GetSpider().GetName(), c.GetTask().GetId())
3 changes: 0 additions & 3 deletions pkg/pipelines/csv.go
@@ -21,7 +21,6 @@ type CsvPipeline struct {
}

func (m *CsvPipeline) ProcessItem(item pkg.Item) (err error) {
spider := m.Spider()
task := item.GetContext().GetTask()
if item == nil {
err = errors.New("nil item")
@@ -149,8 +148,6 @@ }
}

m.logger.Info("item saved:", filename)
item.GetContext().GetItem().WithStatus(pkg.ItemStatusSuccess)
spider.GetCrawler().GetSignal().ItemChanged(item)
task.IncItemSuccess()
return
}
3 changes: 0 additions & 3 deletions pkg/pipelines/json_lines.go
@@ -19,7 +19,6 @@ type JsonLinesPipeline struct {
}

func (m *JsonLinesPipeline) ProcessItem(item pkg.Item) (err error) {
spider := m.Spider()
task := item.GetContext().GetTask()
if item == nil {
err = errors.New("nil item")
@@ -106,8 +105,6 @@
}

m.logger.Info("item saved:", filename)
item.GetContext().GetItem().WithStatus(pkg.ItemStatusSuccess)
spider.GetCrawler().GetSignal().ItemChanged(item)
task.IncItemSuccess()
return
}
3 changes: 0 additions & 3 deletions pkg/pipelines/kafka.go
@@ -20,7 +20,6 @@ type KafkaPipeline struct {
}

func (m *KafkaPipeline) ProcessItem(item pkg.Item) (err error) {
spider := m.Spider()
task := item.GetContext().GetTask()

if item == nil {
@@ -91,8 +90,6 @@ return
return
}

item.GetContext().GetItem().WithStatus(pkg.ItemStatusSuccess)
spider.GetCrawler().GetSignal().ItemChanged(item)
task.IncItemSuccess()
return
}
3 changes: 0 additions & 3 deletions pkg/pipelines/mongo.go
@@ -20,7 +20,6 @@ type MongoPipeline struct {
}

func (m *MongoPipeline) ProcessItem(item pkg.Item) (err error) {
spider := m.Spider()
task := item.GetContext().GetTask()

if item == nil {
@@ -92,8 +91,6 @@ return
return
}

item.GetContext().GetItem().WithStatus(pkg.ItemStatusSuccess)
spider.GetCrawler().GetSignal().ItemChanged(item)
task.IncItemSuccess()
return
}
(The remaining changed files are not shown here.)
