A framework for building fast, distributed crawler systems that gather open data.
- Run the master node:
```
sbt master/run
```
- Run worker nodes (each invocation starts a separate worker):
```
sbt simple-worker/run
sbt simple-worker/run
```
- ConfigurationCollection
{ "workerExecuteInterval" : "35.seconds", "workerFilePath" : "/", "workerBatchSize" : 2.0, "workerBaseUrl" : "https://habr.com/ru", "workerTaskType" : "HabrTasks", "workerParallelBatches" : 1, "workerResource" : "Tor", "workerNotification" : false }
- TorCollection
{ "workerTorHost" : "127.0.0.1", "workerTorLimit" : 1, "workerTorPort" : 9150, "workerTorControlPort" : 0, "workerTorPassword" : "", "workerTorTimeoutUp" : "30.seconds", "workerTorTimeoutDown" : "30.seconds", "workerTaskType" : [ "HabrTasks" ], "usedCount" : 0 }
- CrawlTasks
{ "taskType" : "HabrTasks", "taskData" : "438886", "taskStatus" : "taskWait", "attempt" : 0 }
resolvers += "Cloud Crawler Repository" at "https://dl.bintray.com/jaitl/cloud-crawler",
libraryDependencies += "com.github.jaitl.crawler" %% "worker" % version
repo:
```groovy
repositories {
    maven {
        url "https://dl.bintray.com/jaitl/cloud-crawler"
    }
}
```
dependency:
```groovy
compile 'com.github.jaitl.crawler:worker_2.13:version'
```