From 2ac09c684a48589973f1554ad67c94ddea4a315f Mon Sep 17 00:00:00 2001
From: "Frank M. Taylor"
Date: Tue, 6 Sep 2022 12:09:04 -0500
Subject: [PATCH 1/5] adds ability to check if a file already exists and skip outputting

---
 src/site-crawler.js       | 34 +++++++++++++++++++++++++++++++++-
 test/site-crawler.test.js | 28 ++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/src/site-crawler.js b/src/site-crawler.js
index 27456f3..664c953 100644
--- a/src/site-crawler.js
+++ b/src/site-crawler.js
@@ -1,4 +1,7 @@
+/* eslint-disable max-len */
 const axios = require('axios');
+const fs = require('fs');
+const Path = require('path');
 const cheerio = require('cheerio');
 
 const { Parser } = require('xml2js');
@@ -14,6 +17,7 @@ const DEFAULT_CONFIG = {
   startPage: 'https://frankmtaylor.com',
   linkSelector: 'a[href]',
   shouldCrawl: false,
+  useExportedSitemap: true,
 };
 
 const DEFAULT_LIBRARIES = {
@@ -70,6 +74,22 @@
     return linkArray;
   }
 
+  /**
+   * @description provides a fully qualified path to the sitemap json file
+   * @type {string}
+   */
+  get pathToExportedFile() {
+    return Path.join(process.cwd(), `${this.exportFileName}.${this.outputter.defaultOutputFile}`);
+  }
+
+  /**
+   * @description determines if the links have already been exported to a file
+   * @type {boolean}
+   */
+  get hasExportedLinks() {
+    return fs.existsSync(this.pathToExportedFile);
+  }
+
   /**
    * adds multiple items to the linkSet property
    * @param {string[]} linkArray an array of href values
@@ -288,8 +308,20 @@
   /**
    * @description wrapper for crawl and setSitemap that also produces export file
    * @param {boolean} [shouldCrawl=this.config.shouldCrawl]
+   * @param {boolean} [useExportedSitemap=this.config.useExportedSitemap] use existing file if it already exists
    */
-  async produceSiteLinks(shouldCrawl = this.config.shouldCrawl) {
+  async produceSiteLinks(
+    shouldCrawl = this.config.shouldCrawl,
+    useExportedSitemap = this.config.useExportedSitemap,
+  ) {
+    const shouldNotProduceLinks = useExportedSitemap && this.hasExportedLinks;
+    if (shouldNotProduceLinks) {
+      const alreadyExistsMessage = `The file ${this.pathToExportedFile} already exists and recrawling was not forced.`;
+      await log.infoToFileAsync(alreadyExistsMessage);
+      await log.toConsole(alreadyExistsMessage);
+      return;
+    }
+
     if (shouldCrawl) {
       await this.crawl();
     } else {
diff --git a/test/site-crawler.test.js b/test/site-crawler.test.js
index 2f31997..2e39f85 100644
--- a/test/site-crawler.test.js
+++ b/test/site-crawler.test.js
@@ -146,6 +146,8 @@ describe('SiteCrawler:Crawling', () => {
     expect(SiteCrawler).toHaveProperty('defaultConfig');
     expect(SiteCrawler.defaultConfig).toHaveProperty('startPage', 'https://frankmtaylor.com');
     expect(SiteCrawler.defaultConfig).toHaveProperty('linkSelector', 'a[href]');
+    expect(SiteCrawler.defaultConfig).toHaveProperty('shouldCrawl', false);
+    expect(SiteCrawler.defaultConfig).toHaveProperty('useExportedSitemap', true);
   });
 });
 describe('static getPageAsync', () => {
@@ -219,6 +221,25 @@
       );
     });
   });
+  describe('getters', () => {
+    const siteCrawler = new SiteCrawler();
+
+    test('it has an origin', () => {
+      expect(siteCrawler.origin).toEqual('https://frankmtaylor.com');
+    });
+    test('it has a host', () => {
+      expect(siteCrawler.host).toEqual('frankmtaylor.com');
+    });
+    test('exportFileName', () => {
+      expect(siteCrawler.exportFileName).toEqual('frankmtaylor.com');
+    });
+    test('pathToExportedFile', () => {
+      expect(siteCrawler.pathToExportedFile).toEqual(`${process.cwd()}/frankmtaylor.com.sitemap.json`);
+    });
+    test('hasExportedLinks', () => {
+      expect(siteCrawler.hasExportedLinks).toEqual(false);
+    });
+  });
   describe('getLinksFromPageAsync', () => {
     const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com' }, { ajax: axios });
     test('it gets relative links and ignores external links', async () => {
@@ -366,4 +387,11 @@
     expect(siteCrawler.urlset.length).toEqual(7);
   });
 });
+  describe('produceSiteLinks', () => {
+    test('when produceSiteLinks is run, a file is created and it knows it', async () => {
+      const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
+      await siteCrawler.produceSiteLinks();
+      expect(siteCrawler.hasExportedLinks).toEqual(true);
+    });
+  });
 });
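
Usage sketch of the skip behavior introduced in this patch (illustrative only; the start page is an example and the async IIFE just allows `await`):

```JavaScript
// Minimal sketch, assuming the SiteCrawler API as it stands after this patch.
const SiteCrawler = require('./src/site-crawler');

(async () => {
  const crawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });

  // First run: no exported file exists yet, so links are fetched and written
  // to <cwd>/frankmtaylor.com.sitemap.json.
  await crawler.produceSiteLinks();

  // Second run: hasExportedLinks is now true and useExportedSitemap defaults
  // to true, so produceSiteLinks() logs a notice and returns early.
  await crawler.produceSiteLinks();

  console.log(crawler.hasExportedLinks, crawler.pathToExportedFile);
})();
```
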
From 9876ef5c4963151be5e353ba94949fc020be79ad Mon Sep 17 00:00:00 2001
From: "Frank M. Taylor"
Date: Tue, 6 Sep 2022 13:07:06 -0500
Subject: [PATCH 2/5] Sets ability to read from an existing json file

---
 src/site-crawler.js       | 28 ++++++++++++++++++++++++++--
 test/site-crawler.test.js | 10 ++++++++--
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/site-crawler.js b/src/site-crawler.js
index 664c953..5cf08db 100644
--- a/src/site-crawler.js
+++ b/src/site-crawler.js
@@ -92,10 +92,18 @@
 
   /**
    * adds multiple items to the linkSet property
-   * @param {string[]} linkArray an array of href values
+   * @param {string[]|object[]} linkArray an array of href values, or objects with a loc property
    */
   addLinks(linkArray) {
-    this.linkSet = new Set([...this.linkSet, ...linkArray]);
+    const cleanArray = linkArray.map((link) => {
+      if (typeof link === 'string') {
+        return link;
+      }
+      if (typeof link === 'object' && link.loc) {
+        return link.loc;
+      }
+    });
+    this.linkSet = new Set([...this.linkSet, ...cleanArray]);
   }
 
   /**
@@ -305,6 +313,21 @@
     }
   }
 
+  /**
+   * @description sets links from an existing json file
+   * @param {string} fileName
+   */
+  async setLinksFromJsonFile(fileName) {
+    if (!fileName) return;
+    try {
+      const existingJson = await fs.promises.readFile(fileName, 'utf-8');
+      const existingSiteLinks = JSON.parse(existingJson);
+      this.addLinks(existingSiteLinks);
+    } catch (setLinksError) {
+      await this.errorToFileAsync(setLinksError);
+    }
+  }
+
   /**
    * @description wrapper for crawl and setSitemap that also produces export file
    * @param {boolean} [shouldCrawl=this.config.shouldCrawl]
@@ -319,6 +342,7 @@
       const alreadyExistsMessage = `The file ${this.pathToExportedFile} already exists and recrawling was not forced.`;
       await log.infoToFileAsync(alreadyExistsMessage);
       await log.toConsole(alreadyExistsMessage);
+      await this.setLinksFromJsonFile(`${this.exportFileName}.${this.outputter.defaultOutputFile}`);
       return;
     }
 
diff --git a/test/site-crawler.test.js b/test/site-crawler.test.js
index 2e39f85..6000cac 100644
--- a/test/site-crawler.test.js
+++ b/test/site-crawler.test.js
@@ -1,6 +1,6 @@
 /* eslint-disable no-undef */
 const axios = require('axios');
-
+const fs = require('fs');
 const SiteCrawler = require('../src/site-crawler');
 
 jest.mock('axios');
@@ -124,6 +124,10 @@
   }
 });
 
+afterAll(async () => {
+  await fs.promises.unlink('frankmtaylor.com.sitemap.json');
+});
+
 describe('getting file', () => {
   const siteCrawler = new SiteCrawler();
   test('getFileAsync', async () => {
@@ -388,10 +392,12 @@ describe('SiteCrawler: Fetching Sitemap', () => {
     });
   });
   describe('produceSiteLinks', () => {
-    test('when produceSiteLinks is run, a file is created and it knows it', async () => {
+    test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
       const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
       await siteCrawler.produceSiteLinks();
       expect(siteCrawler.hasExportedLinks).toEqual(true);
+      expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
+      expect(siteCrawler.linkSet.has('https://frankmtaylor.com')).toEqual(true);
     });
   });
 });
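
A round-trip sketch of the new read path (illustrative only; it assumes a frankmtaylor.com.sitemap.json written by an earlier run, and uses only names introduced in these patches):

```JavaScript
// Minimal sketch: setLinksFromJsonFile() feeds the exported JSON through
// addLinks(), which now accepts plain hrefs as well as { loc } objects.
const SiteCrawler = require('./src/site-crawler');

(async () => {
  const crawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });

  // Both shapes normalize to plain hrefs in linkSet:
  crawler.addLinks(['https://frankmtaylor.com/about']);
  crawler.addLinks([{ loc: 'https://frankmtaylor.com/posts' }]);

  // Re-hydrate links from a previously exported file instead of refetching.
  await crawler.setLinksFromJsonFile('frankmtaylor.com.sitemap.json');

  console.log(crawler.linkSet.size);
})();
```
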
From fb749fd8b340a51c3fba5991979edc5b559dd285 Mon Sep 17 00:00:00 2001
From: "Frank M. Taylor"
Date: Tue, 6 Sep 2022 14:19:24 -0500
Subject: [PATCH 3/5] adds ability to turn off relying on exported sitemap

---
 cli.js | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/cli.js b/cli.js
index 8622f54..6b57cf7 100755
--- a/cli.js
+++ b/cli.js
@@ -42,6 +42,12 @@ const { argv } = yargs(hideBin(process.argv))
     type: 'boolean',
     default: DEFAULT_CRAWL,
   })
+  .option('dontUseExportedSitemap', {
+    alias: 'X',
+    description: 'Force the Site Crawler to refetch links and ignore an existing sitemap',
+    type: 'boolean',
+    default: false,
+  })
  .option('limit', {
     alias: 'l',
     description: 'how many pages to crawl',
@@ -96,6 +102,7 @@ const {
   sitemap,
   crawl,
   limit,
+  dontUseExportedSitemap,
   selector,
   outputFileName,
   takeScreenshots,
@@ -109,6 +116,7 @@ const selectorFinderConfig = {
   sitemap,
   crawl,
   limit,
+  useExportedSitemap: !dontUseExportedSitemap,
   selector,
   outputFileName,
   takeScreenshots,
@@ -175,7 +183,8 @@ async function main(config) {
 ${mainConfig.cssFile ? `| cssFile (${cssFile})` : ''}
 ${mainConfig.selector && !mainConfig.cssFile ? `| CSS Selector (${mainConfig.selector})` : ''}
 ${mainConfig.isSpa ? '| Handle as Single Page Application' : ''}
-${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
+${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
+${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file and fetch a sitemap or recrawl'}
 `;
   await log
     .toConsole(startMessage)
@@ -191,6 +200,7 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
     {
       startPage: mainConfig.sitemap,
       shouldCrawl: mainConfig.crawl,
+      useExportedSitemap: mainConfig.useExportedSitemap,
     },
   );
   await log.toConsole(`
@@ -199,9 +209,15 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
   `);
 
   await siteCrawler.produceSiteLinks();
 
-  await log.toConsole(`
+  if (!mainConfig.useExportedSitemap) {
+    await log.toConsole(`
     ||-> Site links exported to ${siteCrawler.exportFileName}
-  `);
+    `);
+  } else {
+    await log.toConsole(`
+    ||-> Site links read from ${siteCrawler.exportFileName}
+    `);
+  }
 
   mainConfig.siteCrawler = siteCrawler;
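
For reference, what `-X` amounts to at the API level — a sketch using the produceSiteLinks signature from PATCH 1 (the start page below is illustrative):

```JavaScript
// Minimal sketch: -X/--dontUseExportedSitemap flips useExportedSitemap to false,
// so produceSiteLinks ignores any existing .sitemap.json and refetches.
const SiteCrawler = require('./src/site-crawler');

(async () => {
  const crawler = new SiteCrawler({ startPage: 'https://wherever.com/sitemap.xml' });

  // Equivalent of the CLI wiring above: useExportedSitemap = !dontUseExportedSitemap.
  await crawler.produceSiteLinks(false /* shouldCrawl */, false /* useExportedSitemap */);
})();
```
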
Taylor" Date: Tue, 6 Sep 2022 14:32:11 -0500 Subject: [PATCH 4/5] updates readme --- README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 21e3cde..c929789 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ npm i -g selector-hound ## Usage +### Basic Scanning Only scan the first 20 pages for `.yourthing` ``` @@ -41,27 +42,64 @@ SelectorHound --sitemap=https://wherever.com/xml --limit=20 --selector=".yourthi SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" ``` -Scan the first 20 pages and take screenshots +### Re-using, regenerating, and providing a list of links +Before the site scanning begins, this generates a `.sitemap.json` file containing all of the links it will scan. This file is generated from the `sitemap.xml` file you provided **or** from crawling the site looking for links. To improve performance, SelectorHound will look for this file _first_ before attempting to retrieve/generate a sitemap. + +If you want to re-generate this `.sitemap.json` file, you can force it: + ``` -SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" -c +SelectorHound --sitemap=https://wherever.com/xml --selector=".yourthing" --dontUseExportedSitemap +SelectorHound -u https://mysite.com/landing -r -s '.yourThing' -X ``` -Scan those pages, but treat them like Single Page Applications (`-d`), and search for all the selectors in `mystyles.css` +#### Formatting +By default, SelectorHound will generate a format that's based off of how sitemap XML looks, which is an array of objects with a `loc` property: +```JavaScript +[ + { + 'loc': 'https://mysite.com/path' + }, + { + 'loc': 'https://mysite.com/another' + } +] ``` -SelectorHound -u https://wherever.com/xml -f "mystyles.css" -d +However, you can also provide your own list of links as just an array of strings: +```JavaScript + [ + "https://mysite.com/path", + "https://mysite.com/another" + ] ``` -Crawl the site, starting from a landing page -``` + + +### Crawling instead of using a sitemap +Crawl the site, starting from a landing page. + +```shell SelectorHound -u https://mysite.com/landing -r -s ".myClass" ``` +### Taking Screenshots or dealing with SPAs +Scan the first 20 pages and take screenshots +```shell +SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" -c +``` + +Scan those pages, but treat them like Single Page Applications (`-d`), and search for all the selectors in `mystyles.css` +```shell +SelectorHound -u https://wherever.com/xml -f "mystyles.css" -d + +``` + ### Options | Option | Alias | Description | Defaults | |---|---|---|---| | `--sitemap` |`-u` | Must be fully qualified URL to an XML Sitemap **or** fully qualified URL to a page **if** `crawl` is `true`. Required. | `https://frankmtaylor.com/sitemap.xml` | +| `--dontUseExportedSitemap` |`-X` | if a `.sitemap.json` file has been already been created, ignore it and generate a new one. Optional. | `false` | | `--limit` | `-l` | Maximum number of pages to crawl. Optional. | `0` | | `--selector` | `-s` | A valid CSS selector. Required. | `.title` | | `--cssFile` | `-f` | A CSS file to use instead of a single selector. Optional. | | From 5626c402713c44d87662ff71e7ee6b87ba506103 Mon Sep 17 00:00:00 2001 From: "Frank M. 
Taylor" Date: Tue, 6 Sep 2022 14:36:52 -0500 Subject: [PATCH 5/5] Updates version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index fb71f99..14f509a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "selector-hound", - "version": "1.3.2", + "version": "1.4.0", "description": "Find an element that matches a particular CSS selector on a website ", "keywords": [ "CSS",