Skip to content

Commit

Permalink
Refactor Harvester to pull jobs (#6)
Browse files Browse the repository at this point in the history
Refactor Harvester to pull jobs
  • Loading branch information
johnerikhalse authored Apr 18, 2018
1 parent 47f08d9 commit 18c537f
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 33 deletions.
38 changes: 38 additions & 0 deletions protobuf/frontier.proto
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,48 @@ import "messages.proto";
// Frontier control service.
service Frontier {
  // Takes a CrawlSeedRequest (job execution id, job and seed) and answers
  // with a CrawlExecutionStatus.
  // NOTE(review): presumably this schedules a crawl of the seed -- confirm.
  rpc CrawlSeed(CrawlSeedRequest) returns (CrawlExecutionStatus);

  // Hands out a URI from the Frontier's queue.
  // Called by a Harvester to obtain a new page to fetch. Both directions are
  // streamed: the Harvester sends PageHarvest messages (first the request
  // for a page, later the harvest result) and the Frontier replies with
  // PageHarvestSpec messages.
  rpc GetNextPage(stream PageHarvest) returns (stream PageHarvestSpec);
}

// Request message for the Frontier.CrawlSeed rpc.
//
// NOTE(review): field numbers 2-4 are not used in this message. If they
// belonged to fields that were removed, they should be listed in a
// `reserved` statement so they are never reused -- confirm against history.
message CrawlSeedRequest {
  // Id of a job execution.
  // NOTE(review): presumably the execution this seed is crawled under --
  // confirm.
  string job_execution_id = 1;

  // The crawl job.
  CrawlJob job = 5;

  // The seed.
  Seed seed = 6;
}

// Sent by a Harvester both to ask the Frontier for a new page to fetch and
// to report the result of that fetch.
//
// Expected message sequence on the stream:
//   1. A message with requestNextPage set to true, which tells the Frontier
//      to respond with a page to fetch.
//   2. Once the fetch is done, one message carrying the metrics.
//   3. One message per outlink, until every outlink has been sent.
//   4. The client then completes the request.
message PageHarvest {
  // Download statistics for one fetched page.
  message Metrics {
    // Number of URIs downloaded: the requested URI plus its embedded
    // resources.
    int32 uri_count = 1;

    // Number of bytes downloaded, embedded resources included.
    int64 bytes_downloaded = 2;
  }

  // Exactly one of the following is carried by each message.
  oneof msg {
    // True when this is the initial request that starts a new fetch.
    // NOTE(review): the name is camelCase while the other fields in this
    // file are lower_snake_case. A rename would be wire-compatible but would
    // change generated accessors in some languages, so it is left as is.
    bool requestNextPage = 1;

    // Metrics collected for the fetched page.
    Metrics metrics = 2;

    // An outlink found in the harvested page.
    QueuedUri outlink = 3;

    // Set when the page fetch as a whole failed. Should not be used to
    // report the failure of a single URI.
    Error error = 4;
  }
}

// Describes a page to fetch. Streamed back by the Frontier.GetNextPage rpc.
message PageHarvestSpec {
  // URI that should be fetched.
  QueuedUri queued_uri = 1;

  // Configuration to apply to the fetch.
  CrawlConfig crawl_config = 2;
}
33 changes: 0 additions & 33 deletions protobuf/harvester.proto

This file was deleted.

0 comments on commit 18c537f

Please sign in to comment.