Merge pull request #7 from galaxiat/dev
Added: path error log, config options to limit the number of parallel crawls, and cleanup
warstrolo authored Apr 28, 2022
2 parents b42e78c + f6da587 commit e4bee6c
Showing 6 changed files with 58 additions and 26 deletions.
5 changes: 4 additions & 1 deletion .galaxiat.json
@@ -17,5 +17,8 @@
 "json_url" : "https://api.galaxiatapp.com/seo/galaxiat.json",
 "cron" : "0 */15 * * * *"
 }
-]
+],
+"crawl_cron" : "* * * * * *",
+"crawl_max_num" : 3,
+"crawl_queue_num" : 10
 }
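
The new `crawl_cron` key uses the same six-field, seconds-first cron syntax as the existing `cron` entries, so `* * * * * *` fires every second and `0 */15 * * * *` fires every fifteenth minute. A minimal sketch of such a schedule, assuming the croner package (whose `new Cron(pattern, callback)` shape matches the calls in src/index.ts further down; the actual dependency is not shown in this diff):

```ts
import { Cron } from "croner";

// Six-field cron: second minute hour day-of-month month day-of-week.
// "* * * * * *" matches every second — the drain tick for the crawl queue.
const job = new Cron("* * * * * *", () => {
  console.log("drain tick at", new Date().toISOString());
});

// Stop after five seconds so the sketch exits.
setTimeout(() => job.stop(), 5000);
```
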
46 changes: 32 additions & 14 deletions README.md
@@ -38,26 +38,32 @@ npm install galaxiat.serve.seo
 ---
 `.galaxiat.json` OR `.galaxiat.{env}.json`

+To set env use the `GALAXIAT_SERVE_ENV` var
+
 ```json
 {
-"hostname": "galaxiatapp.com",
-"port": 3000,
-"args": ["--no-sandbox", "--disable-setuid-sandbox"],
-"target": "https://galaxiatapp.com",
-"public": "./public",
-"crawl": [
+"hostname" : "galaxiatapp.com",
+"port" : 3000,
+"args" : ["--no-sandbox",
+"--disable-setuid-sandbox"],
+"target" : "http://localhost:3000",
+"public" : "./public",
+"crawl" : [
 {
-"type": "config",
-"url": "/path",
-"file": "/cache/path.html",
-"cron": "*/10 * * * * *"
+"type" : "config",
+"url" : "/path",
+"file" : "/cache/path.html",
+"cron" : "0 * * * * *"
 },
 {
-"type": "remote",
-"json_url": "https://api.galaxiatapp.com/seo/galaxiat.json",
-"cron": "0 */5 * * * *"
+"type" : "remote",
+"json_url" : "https://api.galaxiatapp.com/seo/galaxiat.json",
+"cron" : "0 */15 * * * *"
 }
-]
+],
+"crawl_cron" : "* * * * * *",
+"crawl_max_num" : 3,
+"crawl_queue_num" : 10
 }
 ```
 ---
@@ -75,6 +81,18 @@ npm install galaxiat.serve.seo
 ]
 ```

+## RoadMap
+
+- `V1.X.X` - Single workload implementation
+- Per node deployment -> not so good for performance
+- Crawling is done on the local node
+- `V2.X.X` - Multiple workload implementation
+- Multi-node deployment -> better performance
+- Crawling is done on remote node
+- `V3.X.X` - Advanced Multiple workload implementation
+- Multi-node deployment + Cluster cache -> better performance
+- Cache is cluster wide instead of a local cache per node
+
 ## Links

 - [Galaxiat](https://galaxiatapp.com/)
3 changes: 3 additions & 0 deletions package.json
@@ -10,6 +10,9 @@
 "serve-handler": "^6.1.3",
 "typescript": "^4.6.3"
 },
+"files": [
+"/dist"
+],
 "bin": {
 "galaxiat.serve.seo": "./dist/index.js"
 },
3 changes: 2 additions & 1 deletion src/crawl.ts
@@ -24,7 +24,8 @@ export async function Crawl(browser: Browser, crawl_infos: crawl, config: config
 })
 writeFileSync(`${config.public}/${crawl_infos.file}`, ctn)
 } catch (e) {
-console.log(e)
+
+console.log(`${crawl_infos.url} -> ${e}`)
 }
 if (!page.isClosed()) {
 await page.close()
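
For context, the changed catch block lives in a Crawl function that renders one URL with Puppeteer and writes the resulting HTML into the public cache directory. A self-contained sketch of that shape — the type fields and the `waitUntil` option are assumptions; the real definitions are in src/crawl.ts and src/types.ts:

```ts
import { writeFileSync } from "fs";
import type { Browser } from "puppeteer";

// Assumed minimal shapes; the real ones are in src/types.ts.
type crawl = { url: string; file: string };
type config_type = { target: string; public: string };

export async function Crawl(browser: Browser, crawl_infos: crawl, config: config_type) {
  const page = await browser.newPage();
  try {
    // Assumption: wait for the page to settle before snapshotting it.
    await page.goto(`${config.target}${crawl_infos.url}`, { waitUntil: "networkidle0" });
    const ctn = await page.content();
    writeFileSync(`${config.public}/${crawl_infos.file}`, ctn);
  } catch (e) {
    // The change in this commit: prefix the error with the path that
    // failed, so a broken route is identifiable in the logs.
    console.log(`${crawl_infos.url} -> ${e}`);
  }
  if (!page.isClosed()) {
    await page.close();
  }
}
```
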
22 changes: 13 additions & 9 deletions src/index.ts
@@ -26,21 +26,25 @@ const config: config_type = JSON.parse(readFileSync(config_location).toString())
 });
 })

-const browser = await puppeteer.launch({ headless: true, args: config.args });
+const browser = await puppeteer.launch({ headless: false, args: config.args });
+const borwserPID = browser.process();
 let queue = new Stack()

 let httpserv = server.listen(config.port, () => {
 if (config["crawl"] != undefined) {
-
-new Cron("*/15 * * * * *", async () => {
-console.log(queue.count())
-for (const entry of queue.get(10)) {
-
+let curr_crawl_num = 0
+new Cron(config.crawl_cron, async () => {
+console.log("targets :", browser.targets().length)
+console.log("crawl_num :", curr_crawl_num)
+console.log("queue count : ",queue.count())
+if ((curr_crawl_num < config.crawl_max_num)) {
+curr_crawl_num++
+for (const entry of queue.get(config.crawl_queue_num)) {
 await Crawl(browser, entry, config)
-
 }
+curr_crawl_num--
+}
-console.log(browser.targets().length)

 })
 for (const entry of (config["crawl"] as crawl_urls_cron[])) {
 if (entry.type == "config") {
@@ -80,7 +84,7 @@ export class Stack
 return this.list.indexOf(item) === -1 ? this.list.push(item) : this.list.length
 }
 public get(num: number): crawl[] {
-let array : crawl[] = []
+let array: crawl[] = []
 for (let i = 0; i < num; i++) {
 let item = this.list.shift()
 if (item) {
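
The counter added in the first hunk works as a crude semaphore: a tick that finds `curr_crawl_num` below `crawl_max_num` claims a slot, drains up to `crawl_queue_num` queued URLs sequentially, then releases the slot, while ticks that arrive when all slots are busy are skipped outright. Isolated as a sketch (with `config`, `queue`, `browser`, and `Crawl` in scope, as in src/index.ts):

```ts
// Throttled drain loop, extracted from the hunk above.
let curr_crawl_num = 0;

new Cron(config.crawl_cron, async () => {
  // Skip this tick entirely once crawl_max_num passes are in flight.
  if (curr_crawl_num < config.crawl_max_num) {
    curr_crawl_num++;
    // Each pass pulls at most crawl_queue_num entries and crawls them
    // one by one; a slow pass can overlap later ticks, so the counter
    // is what bounds the parallelism at crawl_max_num.
    for (const entry of queue.get(config.crawl_queue_num)) {
      await Crawl(browser, entry, config);
    }
    curr_crawl_num--;
  }
});
```
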
5 changes: 4 additions & 1 deletion src/types.ts
@@ -22,7 +22,10 @@ export type config_type = {
 port: number,
 target: string,
 public: string,
-crawl: crawl_urls_cron[]
+crawl: crawl_urls_cron[],
+crawl_cron : string
+crawl_queue_num : number
+crawl_max_num : number
 }

 export type crawl = {
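
Since `JSON.parse` returns `any`, a config file that predates these keys will still type-check at the call site and only fail later inside the cron callback. A hypothetical fail-fast loader (not part of this commit; `loadConfig` is an illustrative name):

```ts
import { readFileSync } from "fs";
import type { config_type } from "./types";

// Hypothetical loader: parse the config and fail fast if the new
// throttling keys are absent, rather than crashing later in the cron.
export function loadConfig(path: string): config_type {
  const cfg = JSON.parse(readFileSync(path).toString()) as config_type;
  for (const key of ["crawl_cron", "crawl_max_num", "crawl_queue_num"] as const) {
    if (cfg[key] === undefined) {
      throw new Error(`${path}: missing "${key}"`);
    }
  }
  return cfg;
}

// Usage: const config = loadConfig(".galaxiat.json");
```
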
