diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac73b4e7..4ab0556d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,8 @@
 # [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05)
 
-
 ### Features
 
-* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))
+- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))
 
 
 # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)
diff --git a/config.ts b/config.ts
index f5c958df..6a24846b 100644
--- a/config.ts
+++ b/config.ts
@@ -6,4 +6,5 @@ export const defaultConfig: Config = {
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
   maxTokens: 2000000,
+  // proxyUrls: ["http://username:password@proxyserver:port"], // socks5://username:password@proxyserver:port
 };
diff --git a/src/config.ts b/src/config.ts
index 787744ce..0e4f0159 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -85,6 +85,10 @@ export const configSchema = z.object({
    * @example 5000
    */
   maxTokens: z.number().int().positive().optional(),
+  /** Optional proxy server
+   * @example ['http://username:password@proxyserver:port', 'socks5://username:password@proxyserver:port']
+   */
+  proxyUrls: z.array(z.string()).optional(),
 });
 
 export type Config = z.infer<typeof configSchema>;
diff --git a/src/core.ts b/src/core.ts
index c996f2bb..2e19c4e0 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -1,5 +1,10 @@
 // For more information, see https://crawlee.dev/
-import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
+import {
+  Configuration,
+  PlaywrightCrawler,
+  ProxyConfiguration,
+  downloadListOfUrls,
+} from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
@@ -54,8 +59,13 @@ export async function crawl(config: Config) {
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
+    const proxyConfiguration = new ProxyConfiguration({
+      proxyUrls: config.proxyUrls,
+    });
+
     crawler = new PlaywrightCrawler(
       {
+        proxyConfiguration,
         // Use the requestHandler to process each of the crawled pages.
         async requestHandler({ request, page, enqueueLinks, log, pushData }) {
           const title = await page.title();