From 2ac09c684a48589973f1554ad67c94ddea4a315f Mon Sep 17 00:00:00 2001
From: "Frank M. Taylor"
Date: Tue, 6 Sep 2022 12:09:04 -0500
Subject: [PATCH 1/5] adds ability to check if a file already exists and skip outputting

---
 src/site-crawler.js       | 34 +++++++++++++++++++++++++++++++++-
 test/site-crawler.test.js | 28 ++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/src/site-crawler.js b/src/site-crawler.js
index 27456f3..664c953 100644
--- a/src/site-crawler.js
+++ b/src/site-crawler.js
@@ -1,4 +1,7 @@
+/* eslint-disable max-len */
 const axios = require('axios');
+const fs = require('fs');
+const Path = require('path');
 const cheerio = require('cheerio');
 
 const { Parser } = require('xml2js');
@@ -14,6 +17,7 @@ const DEFAULT_CONFIG = {
   startPage: 'https://frankmtaylor.com',
   linkSelector: 'a[href]',
   shouldCrawl: false,
+  useExportedSitemap: true,
 };
 
 const DEFAULT_LIBRARIES = {
@@ -70,6 +74,22 @@
     return linkArray;
   }
 
+  /**
+   * @description provides a fully qualified path to the sitemap json file
+   * @type {string}
+   */
+  get pathToExportedFile() {
+    return Path.join(process.cwd(), `${this.exportFileName}.${this.outputter.defaultOutputFile}`);
+  }
+
+  /**
+   * @description determines if the links have already been exported to a file
+   * @type {boolean}
+   */
+  get hasExportedLinks() {
+    return fs.existsSync(this.pathToExportedFile);
+  }
+
   /**
    * adds multiple items to the linkSet property
    * @param {string[]} linkArray an array of href values
@@ -288,8 +308,20 @@
   /**
    * @description wrapper for crawl and setSitemap that also produces export file
    * @param {boolean} [shouldCrawl=this.config.shouldCrawl]
+   * @param {boolean} [useExportedSitemap=this.config.useExportedSitemap] use existing file if it already exists
    */
-  async produceSiteLinks(shouldCrawl = this.config.shouldCrawl) {
+  async produceSiteLinks(
+    shouldCrawl = this.config.shouldCrawl,
+    useExportedSitemap = this.config.useExportedSitemap,
+  ) {
+    const shouldNotProduceLinks = useExportedSitemap && this.hasExportedLinks;
+    if (shouldNotProduceLinks) {
+      const alreadyExistsMessage = `The file ${this.pathToExportedFile} already exists and recrawling was not forced.`;
+      await log.infoToFileAsync(alreadyExistsMessage);
+      await log.toConsole(alreadyExistsMessage);
+      return;
+    }
+
     if (shouldCrawl) {
       await this.crawl();
     } else {
diff --git a/test/site-crawler.test.js b/test/site-crawler.test.js
index 2f31997..2e39f85 100644
--- a/test/site-crawler.test.js
+++ b/test/site-crawler.test.js
@@ -146,6 +146,8 @@ describe('SiteCrawler:Crawling', () => {
     expect(SiteCrawler).toHaveProperty('defaultConfig');
     expect(SiteCrawler.defaultConfig).toHaveProperty('startPage', 'https://frankmtaylor.com');
     expect(SiteCrawler.defaultConfig).toHaveProperty('linkSelector', 'a[href]');
+    expect(SiteCrawler.defaultConfig).toHaveProperty('shouldCrawl', false);
+    expect(SiteCrawler.defaultConfig).toHaveProperty('useExportedSitemap', true);
   });
 });
 describe('static getPageAsync', () => {
@@ -219,6 +221,25 @@
       );
     });
   });
+  describe('getters', () => {
+    const siteCrawler = new SiteCrawler();
+
+    test('it has an origin', () => {
+      expect(siteCrawler.origin).toEqual('https://frankmtaylor.com');
+    });
+    test('it has a host', () => {
+      expect(siteCrawler.host).toEqual('frankmtaylor.com');
+    });
+    test('exportFileName', () => {
+      expect(siteCrawler.exportFileName).toEqual('frankmtaylor.com');
+    });
+    test('pathToExportedFile', () => {
+      expect(siteCrawler.pathToExportedFile).toEqual(`${process.cwd()}/frankmtaylor.com.sitemap.json`);
+    });
+    test('hasExportedLinks', () => {
+      expect(siteCrawler.hasExportedLinks).toEqual(false);
+    });
+  });
   describe('getLinksFromPageAsync', () => {
     const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com' }, { ajax: axios });
     test('it gets relative links and ignores external links', async () => {
@@ -366,4 +387,11 @@
     expect(siteCrawler.urlset.length).toEqual(7);
   });
 });
+  describe('produceSiteLinks', () => {
+    test('when produceSiteLinks is run, a file is created and it knows it', async () => {
+      const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
+      await siteCrawler.produceSiteLinks();
+      expect(siteCrawler.hasExportedLinks).toEqual(true);
+    });
+  });
 });
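
Usage sketch of the skip behavior introduced in this patch (illustrative only; the start page is an example and the async IIFE just allows `await`):

```JavaScript
// Minimal sketch, assuming the SiteCrawler API as it stands after this patch.
const SiteCrawler = require('./src/site-crawler');

(async () => {
  const crawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });

  // First run: no exported file exists yet, so links are fetched and written
  // to <cwd>/frankmtaylor.com.sitemap.json.
  await crawler.produceSiteLinks();

  // Second run: hasExportedLinks is now true and useExportedSitemap defaults
  // to true, so produceSiteLinks() logs a notice and returns early.
  await crawler.produceSiteLinks();

  console.log(crawler.hasExportedLinks, crawler.pathToExportedFile);
})();
```
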
From 9876ef5c4963151be5e353ba94949fc020be79ad Mon Sep 17 00:00:00 2001
From: "Frank M. Taylor"
Date: Tue, 6 Sep 2022 13:07:06 -0500
Subject: [PATCH 2/5] Sets ability to read from an existing json file

---
 src/site-crawler.js       | 28 ++++++++++++++++++++++++++--
 test/site-crawler.test.js | 10 ++++++++--
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/site-crawler.js b/src/site-crawler.js
index 664c953..5cf08db 100644
--- a/src/site-crawler.js
+++ b/src/site-crawler.js
@@ -92,10 +92,18 @@
 
   /**
    * adds multiple items to the linkSet property
-   * @param {string[]} linkArray an array of href values
+   * @param {string[]|object[]} linkArray an array of href values, or objects with a loc property
    */
   addLinks(linkArray) {
-    this.linkSet = new Set([...this.linkSet, ...linkArray]);
+    const cleanArray = linkArray.map((link) => {
+      if (typeof link === 'string') {
+        return link;
+      }
+      if (typeof link === 'object' && link.loc) {
+        return link.loc;
+      }
+    });
+    this.linkSet = new Set([...this.linkSet, ...cleanArray]);
   }
 
   /**
@@ -305,6 +313,21 @@
     }
   }
 
+  /**
+   * @description sets links from an existing json file
+   * @param {string} fileName
+   */
+  async setLinksFromJsonFile(fileName) {
+    if (!fileName) return;
+    try {
+      const existingJson = await fs.promises.readFile(fileName, 'utf-8');
+      const existingSiteLinks = JSON.parse(existingJson);
+      this.addLinks(existingSiteLinks);
+    } catch (setLinksError) {
+      await this.errorToFileAsync(setLinksError);
+    }
+  }
+
   /**
    * @description wrapper for crawl and setSitemap that also produces export file
    * @param {boolean} [shouldCrawl=this.config.shouldCrawl]
@@ -319,6 +342,7 @@
       const alreadyExistsMessage = `The file ${this.pathToExportedFile} already exists and recrawling was not forced.`;
       await log.infoToFileAsync(alreadyExistsMessage);
       await log.toConsole(alreadyExistsMessage);
+      await this.setLinksFromJsonFile(`${this.exportFileName}.${this.outputter.defaultOutputFile}`);
       return;
     }
 
diff --git a/test/site-crawler.test.js b/test/site-crawler.test.js
index 2e39f85..6000cac 100644
--- a/test/site-crawler.test.js
+++ b/test/site-crawler.test.js
@@ -1,6 +1,6 @@
 /* eslint-disable no-undef */
 const axios = require('axios');
-
+const fs = require('fs');
 const SiteCrawler = require('../src/site-crawler');
 
 jest.mock('axios');
@@ -124,6 +124,10 @@
   }
 });
 
+afterAll(async () => {
+  await fs.promises.unlink('frankmtaylor.com.sitemap.json');
+});
+
 describe('getting file', () => {
   const siteCrawler = new SiteCrawler();
   test('getFileAsync', async () => {
@@ -388,10 +392,12 @@ describe('SiteCrawler: Fetching Sitemap', () => {
     });
   });
   describe('produceSiteLinks', () => {
-    test('when produceSiteLinks is run, a file is created and it knows it', async () => {
+    test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
       const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
       await siteCrawler.produceSiteLinks();
       expect(siteCrawler.hasExportedLinks).toEqual(true);
+      expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
+      expect(siteCrawler.linkSet.has('https://frankmtaylor.com')).toEqual(true);
     });
   });
 });
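
A round-trip sketch of the new read path (illustrative only; it assumes a frankmtaylor.com.sitemap.json written by an earlier run, and uses only names introduced in these patches):

```JavaScript
// Minimal sketch: setLinksFromJsonFile() feeds the exported JSON through
// addLinks(), which now accepts plain hrefs as well as { loc } objects.
const SiteCrawler = require('./src/site-crawler');

(async () => {
  const crawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });

  // Both shapes normalize to plain hrefs in linkSet:
  crawler.addLinks(['https://frankmtaylor.com/about']);
  crawler.addLinks([{ loc: 'https://frankmtaylor.com/posts' }]);

  // Re-hydrate links from a previously exported file instead of refetching.
  await crawler.setLinksFromJsonFile('frankmtaylor.com.sitemap.json');

  console.log(crawler.linkSet.size);
})();
```
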
From fb749fd8b340a51c3fba5991979edc5b559dd285 Mon Sep 17 00:00:00 2001
From: "Frank M. Taylor"
Date: Tue, 6 Sep 2022 14:19:24 -0500
Subject: [PATCH 3/5] adds ability to turn off relying on exported sitemap

---
 cli.js | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/cli.js b/cli.js
index 8622f54..6b57cf7 100755
--- a/cli.js
+++ b/cli.js
@@ -42,6 +42,12 @@ const { argv } = yargs(hideBin(process.argv))
     type: 'boolean',
     default: DEFAULT_CRAWL,
   })
+  .option('dontUseExportedSitemap', {
+    alias: 'X',
+    description: 'Force the Site Crawler to refetch links and ignore an existing sitemap',
+    type: 'boolean',
+    default: false,
+  })
  .option('limit', {
     alias: 'l',
     description: 'how many pages to crawl',
@@ -96,6 +102,7 @@ const {
   sitemap,
   crawl,
   limit,
+  dontUseExportedSitemap,
   selector,
   outputFileName,
   takeScreenshots,
@@ -109,6 +116,7 @@ const selectorFinderConfig = {
   sitemap,
   crawl,
   limit,
+  useExportedSitemap: !dontUseExportedSitemap,
   selector,
   outputFileName,
   takeScreenshots,
@@ -175,7 +183,8 @@ async function main(config) {
 ${mainConfig.cssFile ? `| cssFile (${cssFile})` : ''}
 ${mainConfig.selector && !mainConfig.cssFile ? `| CSS Selector (${mainConfig.selector})` : ''}
 ${mainConfig.isSpa ? '| Handle as Single Page Application' : ''}
-${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
+${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
+${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file and fetch a sitemap or recrawl'}
 `;
   await log
     .toConsole(startMessage)
@@ -191,6 +200,7 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
     {
       startPage: mainConfig.sitemap,
       shouldCrawl: mainConfig.crawl,
+      useExportedSitemap: mainConfig.useExportedSitemap,
     },
   );
   await log.toConsole(`
@@ -199,9 +209,15 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
   `);
 
   await siteCrawler.produceSiteLinks();
 
-  await log.toConsole(`
+  if (!mainConfig.useExportedSitemap) {
+    await log.toConsole(`
     ||-> Site links exported to ${siteCrawler.exportFileName}
-  `);
+    `);
+  } else {
+    await log.toConsole(`
+    ||-> Site links read from ${siteCrawler.exportFileName}
+    `);
+  }
 
   mainConfig.siteCrawler = siteCrawler;
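
For reference, what `-X` amounts to at the API level — a sketch using the produceSiteLinks signature from PATCH 1 (the start page below is illustrative):

```JavaScript
// Minimal sketch: -X/--dontUseExportedSitemap flips useExportedSitemap to false,
// so produceSiteLinks ignores any existing .sitemap.json and refetches.
const SiteCrawler = require('./src/site-crawler');

(async () => {
  const crawler = new SiteCrawler({ startPage: 'https://wherever.com/sitemap.xml' });

  // Equivalent of the CLI wiring above: useExportedSitemap = !dontUseExportedSitemap.
  await crawler.produceSiteLinks(false /* shouldCrawl */, false /* useExportedSitemap */);
})();
```
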
Taylor" Date: Tue, 6 Sep 2022 14:32:11 -0500 Subject: [PATCH 4/5] updates readme --- README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 21e3cde..c929789 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ npm i -g selector-hound ## Usage +### Basic Scanning Only scan the first 20 pages for `.yourthing` ``` @@ -41,27 +42,64 @@ SelectorHound --sitemap=https://wherever.com/xml --limit=20 --selector=".yourthi SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" ``` -Scan the first 20 pages and take screenshots +### Re-using, regenerating, and providing a list of links +Before the site scanning begins, this generates a `.sitemap.json` file containing all of the links it will scan. This file is generated from the `sitemap.xml` file you provided **or** from crawling the site looking for links. To improve performance, SelectorHound will look for this file _first_ before attempting to retrieve/generate a sitemap. + +If you want to re-generate this `.sitemap.json` file, you can force it: + ``` -SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" -c +SelectorHound --sitemap=https://wherever.com/xml --selector=".yourthing" --dontUseExportedSitemap +SelectorHound -u https://mysite.com/landing -r -s '.yourThing' -X ``` -Scan those pages, but treat them like Single Page Applications (`-d`), and search for all the selectors in `mystyles.css` +#### Formatting +By default, SelectorHound will generate a format that's based off of how sitemap XML looks, which is an array of objects with a `loc` property: +```JavaScript +[ + { + 'loc': 'https://mysite.com/path' + }, + { + 'loc': 'https://mysite.com/another' + } +] ``` -SelectorHound -u https://wherever.com/xml -f "mystyles.css" -d +However, you can also provide your own list of links as just an array of strings: +```JavaScript + [ + "https://mysite.com/path", + "https://mysite.com/another" + ] ``` -Crawl the site, starting from a landing page -``` + + +### Crawling instead of using a sitemap +Crawl the site, starting from a landing page. + +```shell SelectorHound -u https://mysite.com/landing -r -s ".myClass" ``` +### Taking Screenshots or dealing with SPAs +Scan the first 20 pages and take screenshots +```shell +SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" -c +``` + +Scan those pages, but treat them like Single Page Applications (`-d`), and search for all the selectors in `mystyles.css` +```shell +SelectorHound -u https://wherever.com/xml -f "mystyles.css" -d + +``` + ### Options | Option | Alias | Description | Defaults | |---|---|---|---| | `--sitemap` |`-u` | Must be fully qualified URL to an XML Sitemap **or** fully qualified URL to a page **if** `crawl` is `true`. Required. | `https://frankmtaylor.com/sitemap.xml` | +| `--dontUseExportedSitemap` |`-X` | if a `.sitemap.json` file has been already been created, ignore it and generate a new one. Optional. | `false` | | `--limit` | `-l` | Maximum number of pages to crawl. Optional. | `0` | | `--selector` | `-s` | A valid CSS selector. Required. | `.title` | | `--cssFile` | `-f` | A CSS file to use instead of a single selector. Optional. | | From 5626c402713c44d87662ff71e7ee6b87ba506103 Mon Sep 17 00:00:00 2001 From: "Frank M. 
Taylor" Date: Tue, 6 Sep 2022 14:36:52 -0500 Subject: [PATCH 5/5] Updates version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index fb71f99..14f509a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "selector-hound", - "version": "1.3.2", + "version": "1.4.0", "description": "Find an element that matches a particular CSS selector on a website ", "keywords": [ "CSS",