Major Refactoring for Enhanced Flexibility and Performance #90

Open · wants to merge 4 commits into base: main
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,5 +13,6 @@ storage
!tsconfig.json

# any output from the crawler
*.json
output.json
output
pnpm-lock.yaml
5 changes: 5 additions & 0 deletions .prettierignore
@@ -0,0 +1,5 @@
# Add files here to ignore them from prettier formatting
/dist
/output
/storage
/node_modules
23 changes: 23 additions & 0 deletions .prettierrc
@@ -0,0 +1,23 @@
{
"printWidth": 120,
"tabWidth": 4,
"useTabs": true,
"semi": true,
"singleQuote": false,
"bracketSpacing": true,
"trailingComma": "all",
"arrowParens": "avoid",
"endOfLine": "lf",
"proseWrap": "never",
"quoteProps": "as-needed",
"jsxSingleQuote": false,
"htmlWhitespaceSensitivity": "strict",
"overrides": [
{
"files": "*.json",
"options": {
"trailingComma": "all"
}
}
]
}
81 changes: 43 additions & 38 deletions README.md
@@ -4,21 +4,19 @@ Crawl a site to generate knowledge files to create your own custom GPT from one

![Gif showing the crawl run](https://github.com/BuilderIO/gpt-crawler/assets/844291/feb8763a-152b-4708-9c92-013b5c70d2f2)

- [Example](#example)
- [Get started](#get-started)
- [Running locally](#running-locally)
- [Clone the repository](#clone-the-repository)
- [Install dependencies](#install-dependencies)
- [Configure the crawler](#configure-the-crawler)
- [Run your crawler](#run-your-crawler)
- [Alternative methods](#alternative-methods)
- [Running in a container with Docker](#running-in-a-container-with-docker)
- [Running as a CLI](#running-as-a-cli)
- [Development](#development)
- [Upload your data to OpenAI](#upload-your-data-to-openai)
- [Create a custom GPT](#create-a-custom-gpt)
- [Create a custom assistant](#create-a-custom-assistant)
- [Contributing](#contributing)
- [Example](#example)
- [Get started](#get-started)
- [Running locally](#running-locally)
- [Clone the repository](#clone-the-repository)
- [Install dependencies](#install-dependencies)
- [Configure the crawler](#configure-the-crawler)
- [Run your crawler](#run-your-crawler)
- [Alternative methods](#alternative-methods)
- [Running in a container with Docker](#running-in-a-container-with-docker)
- [Upload your data to OpenAI](#upload-your-data-to-openai)
- [Create a custom GPT](#create-a-custom-gpt)
- [Create a custom assistant](#create-a-custom-assistant)
- [Contributing](#contributing)

## Example

@@ -56,34 +54,41 @@ E.g. to crawl the Builder.io docs to make our custom GPT you can use:

```ts
export const defaultConfig: Config = {
url: "https://www.builder.io/c/docs/developers",
match: "https://www.builder.io/c/docs/**",
selector: `.docs-builder-container`,
maxPagesToCrawl: 50,
outputFileName: "output.json",
url: "https://www.builder.io/c/docs/developers",
match: "https://www.builder.io/c/docs/**",
selector: `.docs-builder-container`,
excludeSelectors: [],
maxPagesToCrawl: 50,
outputFileName: "data.json",
};
```

See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options:

```ts
type Config = {
/** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
url: string;
/** Pattern to match against for links on a page to subsequently crawl */
match: string;
/** Selector to grab the inner text from */
selector: string;
/** Don't crawl more than this many pages */
maxPagesToCrawl: number;
/** File name for the finished data */
outputFileName: string;
/** Optional resources to exclude
*
* @example
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
*/
resourceExclusions?: string[];
/** URL to start the crawl */
url: string;
/** Pattern to match against for links on a page to subsequently crawl */
match: string;
/** Selector to grab the inner text from. */
selector: string;
/** Selectors whose matched text is excluded from the final result */
excludeSelectors?: string | string[];
/** Don't crawl more than this many pages */
maxPagesToCrawl?: number;
/** Maximum concurrency level for crawling pages */
maxConcurrency?: number;
/** Name of the dataset. Must be used when multiple configs are provided */
name?: string;
/** File name for the finished data */
outputFileName?: string;
/** Optional resources to exclude
*
* @example
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
*/
resourceExclusions?: string[];
};
```
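
Since the refactor lets `config.ts` export either a single config or an array of them (see the `ConfigInput | ConfigInput[]` change further down in this PR), a multi-crawl setup might look like the sketch below. The second entry's URL, the selectors, and the dataset names are illustrative placeholders, not values taken from this PR:

```ts
import { ConfigInput } from "./src/config.js";

// Sketch only: two crawls defined in one config file. Each `name` keys its
// own dataset and each `outputFileName` its own result file.
export const defaultConfig: ConfigInput | ConfigInput[] = [
  {
    name: "builder-docs",
    url: "https://www.builder.io/c/docs/developers",
    match: "https://www.builder.io/c/docs/**",
    selector: ".docs-builder-container",
    excludeSelectors: [".header", ".footer"], // hypothetical selectors to strip
    maxPagesToCrawl: 50,
    maxConcurrency: 2,
    outputFileName: "docs.json",
  },
  {
    name: "builder-blog",
    url: "https://www.builder.io/blog", // placeholder second start URL
    match: "https://www.builder.io/blog/**",
    selector: "main",
    maxPagesToCrawl: 20,
    outputFileName: "blog.json",
  },
];
```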

@@ -97,11 +102,11 @@ npm start

#### [Running in a container with Docker](./containerapp/README.md)

To obtain the `output.json` with a containerized execution. Go into the `containerapp` directory. Modify the `config.ts` same as above, the `output.json`file should be generated in the data folder. Note : the `outputFileName` property in the `config.ts` file in containerapp folder is configured to work with the container.
To obtain `output/data.json` from a containerized execution, go into the `containerapp` directory and modify `config.ts` as described above; the `output/data.json` file should be generated in the data folder. Note: the `outputFileName` property in the `config.ts` file in the `containerapp` folder is already configured to work with the container.

### Upload your data to OpenAI

The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
The crawl will generate a file called `output/data.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
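
If you prefer to script this step rather than click through the web UI, something along these lines should work with the official `openai` npm package. This is not part of this PR and assumes `OPENAI_API_KEY` is set in your environment:

```ts
import fs from "node:fs";
import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Upload the crawled knowledge file so it can later be attached to an assistant.
const file = await openai.files.create({
  file: fs.createReadStream("output/data.json"),
  purpose: "assistants",
});

console.log(`Uploaded file: ${file.id}`);
```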

#### Create a custom GPT

14 changes: 8 additions & 6 deletions config.ts
@@ -1,8 +1,10 @@
import { Config } from "./src/config";
import { ConfigInput } from "./src/config.js";

export const defaultConfig: Config = {
url: "https://www.builder.io/c/docs/developers",
match: "https://www.builder.io/c/docs/**",
maxPagesToCrawl: 50,
outputFileName: "output.json",
export const defaultConfig: ConfigInput | ConfigInput[] = {
url: "https://www.builder.io/c/docs/developers",
match: "https://www.builder.io/c/docs/**",
selector: ".docs-builder-container",
excludeSelectors: [],
maxPagesToCrawl: 50,
outputFileName: "data.json",
};
14 changes: 8 additions & 6 deletions containerapp/data/config.ts
@@ -1,8 +1,10 @@
import { Config } from "./src/config";
import { ConfigInput } from "./src/config.js";

export const defaultConfig: Config = {
url: "https://www.builder.io/c/docs/developers",
match: "https://www.builder.io/c/docs/**",
maxPagesToCrawl: 50,
outputFileName: "../data/output.json",
export const defaultConfig: ConfigInput | ConfigInput[] = {
url: "https://www.builder.io/c/docs/developers",
match: "https://www.builder.io/c/docs/**",
selector: "",
excludeSelectors: [],
maxPagesToCrawl: 50,
outputFileName: "data.json",
};