Merge pull request #148 from kaibadash/issues/147
fix: #147 Set `purgeOnStart: true` to process multiple sites as a server
marcelovicentegc authored Feb 26, 2024
2 parents 6a417bf + 5a2a565 commit 892cd9d
Showing 2 changed files with 80 additions and 76 deletions.
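In short, the core.ts change below passes a Crawlee `Configuration` with `purgeOnStart: true` as a second argument to the `PlaywrightCrawler` constructor; the crawler options themselves are unchanged apart from re-indentation. A minimal sketch of the pattern (the trivial requestHandler here is a placeholder, not the project's real handler):

import { Configuration, PlaywrightCrawler } from "crawlee";

// Per-crawler configuration: purge the default storages when the crawler
// starts, instead of relying on the process-wide default behaviour.
const crawler = new PlaywrightCrawler(
  {
    async requestHandler({ request, log }) {
      log.info(`Visited ${request.loadedUrl}`);
    },
  },
  new Configuration({
    purgeOnStart: true,
  }),
);

Per the commit message, this is what allows a single long-running server process to crawl several sites in sequence; presumably each crawl should start from freshly purged storage rather than inherit the previous site's request-queue state.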
6 changes: 2 additions & 4 deletions CHANGELOG.md
@@ -1,14 +1,12 @@
 # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)

-
 ### Bug Fixes

-* linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))
-
+- linting ([0f4e58b](https://github.com/BuilderIO/gpt-crawler/commit/0f4e58b400eab312e7b595d7a2472bae93055415))

 ### Features

-* add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))
+- add server api readme docs ([717e625](https://github.com/BuilderIO/gpt-crawler/commit/717e625f47257bdbd96437acb7242bcd28c233ba))

 # [1.3.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.2.1...v1.3.0) (2024-01-06)

150 changes: 78 additions & 72 deletions src/core.ts
@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
+import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
@@ -54,83 +54,89 @@ export async function crawl(config: Config) {
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
-    crawler = new PlaywrightCrawler({
-      // Use the requestHandler to process each of the crawled pages.
-      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
-        const title = await page.title();
-        pageCounter++;
-        log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
-        );
-
-        // Use custom handling for XPath selector
-        if (config.selector) {
-          if (config.selector.startsWith("/")) {
-            await waitForXPath(
-              page,
-              config.selector,
-              config.waitForSelectorTimeout ?? 1000,
-            );
-          } else {
-            await page.waitForSelector(config.selector, {
-              timeout: config.waitForSelectorTimeout ?? 1000,
-            });
-          }
-        }
-
-        const html = await getPageHtml(page, config.selector);
-
-        // Save results as JSON to ./storage/datasets/default
-        await pushData({ title, url: request.loadedUrl, html });
-
-        if (config.onVisitPage) {
-          await config.onVisitPage({ page, pushData });
-        }
-
-        // Extract links from the current page
-        // and add them to the crawling queue.
-        await enqueueLinks({
-          globs:
-            typeof config.match === "string" ? [config.match] : config.match,
-          exclude:
-            typeof config.exclude === "string"
-              ? [config.exclude]
-              : config.exclude ?? [],
-        });
-      },
-      // Comment this option to scrape the full website.
-      maxRequestsPerCrawl: config.maxPagesToCrawl,
-      // Uncomment this option to see the browser window.
-      // headless: false,
-      preNavigationHooks: [
-        // Abort requests for certain resource types
-        async ({ request, page, log }) => {
-          // If there are no resource exclusions, return
-          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
-          if (RESOURCE_EXCLUSTIONS.length === 0) {
-            return;
-          }
-          if (config.cookie) {
-            const cookies = (
-              Array.isArray(config.cookie) ? config.cookie : [config.cookie]
-            ).map((cookie) => {
-              return {
-                name: cookie.name,
-                value: cookie.value,
-                url: request.loadedUrl,
-              };
-            });
-            await page.context().addCookies(cookies);
-          }
-          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
-            route.abort("aborted"),
-          );
-          log.info(
-            `Aborting requests for as this is a resource excluded route`,
-          );
-        },
-      ],
-    });
+    crawler = new PlaywrightCrawler(
+      {
+        // Use the requestHandler to process each of the crawled pages.
+        async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+          const title = await page.title();
+          pageCounter++;
+          log.info(
+            `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+          );
+
+          // Use custom handling for XPath selector
+          if (config.selector) {
+            if (config.selector.startsWith("/")) {
+              await waitForXPath(
+                page,
+                config.selector,
+                config.waitForSelectorTimeout ?? 1000,
+              );
+            } else {
+              await page.waitForSelector(config.selector, {
+                timeout: config.waitForSelectorTimeout ?? 1000,
+              });
+            }
+          }
+
+          const html = await getPageHtml(page, config.selector);
+
+          // Save results as JSON to ./storage/datasets/default
+          await pushData({ title, url: request.loadedUrl, html });
+
+          if (config.onVisitPage) {
+            await config.onVisitPage({ page, pushData });
+          }
+
+          // Extract links from the current page
+          // and add them to the crawling queue.
+          await enqueueLinks({
+            globs:
+              typeof config.match === "string" ? [config.match] : config.match,
+            exclude:
+              typeof config.exclude === "string"
+                ? [config.exclude]
+                : config.exclude ?? [],
+          });
+        },
+        // Comment this option to scrape the full website.
+        maxRequestsPerCrawl: config.maxPagesToCrawl,
+        // Uncomment this option to see the browser window.
+        // headless: false,
+        preNavigationHooks: [
+          // Abort requests for certain resource types
+          async ({ request, page, log }) => {
+            // If there are no resource exclusions, return
+            const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+            if (RESOURCE_EXCLUSTIONS.length === 0) {
+              return;
+            }
+            if (config.cookie) {
+              const cookies = (
+                Array.isArray(config.cookie) ? config.cookie : [config.cookie]
+              ).map((cookie) => {
+                return {
+                  name: cookie.name,
+                  value: cookie.value,
+                  url: request.loadedUrl,
+                };
+              });
+              await page.context().addCookies(cookies);
+            }
+            await page.route(
+              `**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`,
+              (route) => route.abort("aborted"),
+            );
+            log.info(
+              `Aborting requests for as this is a resource excluded route`,
+            );
+          },
+        ],
+      },
+      new Configuration({
+        purgeOnStart: true,
+      }),
+    );

     const isUrlASitemap = /sitemap.*\.xml$/.test(config.url);

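For context, a hypothetical sketch of the server scenario named in the commit message: two sites crawled back to back by the same long-running process. The field names url, match, and maxPagesToCrawl come from the Config usages visible in the diff above; the full set of required fields is defined by configSchema in config.ts and may include more than is shown here.

import { Config } from "./config.js";
import { crawl } from "./core.js";

// Hypothetical back-to-back crawls inside one server process. With
// purgeOnStart: true the intent (per the commit message) is that the second
// crawl starts clean instead of reusing the first site's crawl state.
const sites: Config[] = [
  {
    url: "https://example.com/docs",
    match: "https://example.com/docs/**",
    maxPagesToCrawl: 50,
  },
  {
    url: "https://example.org/help",
    match: "https://example.org/help/**",
    maxPagesToCrawl: 50,
  },
];

async function crawlAllSites(configs: Config[]) {
  for (const site of configs) {
    await crawl(site); // sequential crawls within the same process
  }
}

crawlAllSites(sites).catch(console.error);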
