Merge pull request #13 from paceaux/develop
Develop
paceaux authored Sep 6, 2022
2 parents e9245f6 + 5626c40 commit c5d0d3e
Showing 5 changed files with 158 additions and 14 deletions.
50 changes: 44 additions & 6 deletions README.md
@@ -34,34 +34,72 @@ npm i -g selector-hound

## Usage

### Basic Scanning
Only scan the first 20 pages for `.yourthing`

```shell
SelectorHound --sitemap=https://wherever.com/xml --limit=20 --selector=".yourthing"
SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing"
```

### Re-using, regenerating, and providing a list of links
Before scanning begins, SelectorHound generates a `<site>.sitemap.json` file containing all of the links it will scan. This file is built from the `sitemap.xml` you provide **or** by crawling the site for links. To improve performance, SelectorHound looks for this file _first_ before attempting to retrieve or generate a sitemap.

If you want to re-generate this `<site>.sitemap.json` file, you can force it:

```
SelectorHound --sitemap=https://wherever.com/xml --selector=".yourthing" --dontUseExportedSitemap
SelectorHound -u https://mysite.com/landing -r -s '.yourThing' -X
```

#### Formatting
By default, SelectorHound generates a file modeled on sitemap XML: an array of objects with a `loc` property:
```JavaScript
[
{
'loc': 'https://mysite.com/path'
},
{
'loc': 'https://mysite.com/another'
}
]
```

However, you can also provide your own list of links as just an array of strings:
```JavaScript
[
"https://mysite.com/path",
"https://mysite.com/another"
]
```
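
If you build this list yourself, a short Node script can write it in either accepted shape. This is a minimal sketch, not part of SelectorHound itself; the `mysite.com.sitemap.json` file name is hypothetical and follows the `<site>.sitemap.json` convention described above, which SelectorHound resolves from the current working directory:

```JavaScript
// Minimal sketch: write a custom link list for SelectorHound to pick up.
// The file name is hypothetical; it follows the <site>.sitemap.json convention.
const fs = require('fs');

const links = [
  'https://mysite.com/path',
  'https://mysite.com/another',
];

fs.writeFileSync('mysite.com.sitemap.json', JSON.stringify(links, null, 2));
```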


### Crawling instead of using a sitemap
Crawl the site, starting from a landing page.

```shell
SelectorHound -u https://mysite.com/landing -r -s ".myClass"
```

### Taking Screenshots or dealing with SPAs
Scan the first 20 pages and take screenshots
```shell
SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" -c
```

Scan those pages, but treat them like Single Page Applications (`-d`), and search for all the selectors in `mystyles.css`
```shell
SelectorHound -u https://wherever.com/xml -f "mystyles.css" -d
```

### Options

| Option | Alias | Description | Defaults |
|---|---|---|---|
| `--sitemap` |`-u` | A fully qualified URL to an XML sitemap, **or** a fully qualified URL to a page **if** `crawl` is `true`. Required. | `https://frankmtaylor.com/sitemap.xml` |
| `--dontUseExportedSitemap` |`-X` | If a `<site>.sitemap.json` file has already been created, ignore it and generate a new one. Optional. | `false` |
| `--limit` | `-l` | Maximum number of pages to crawl. Optional. | `0` |
| `--selector` | `-s` | A valid CSS selector. Required. | `.title` |
| `--cssFile` | `-f` | A CSS file to use instead of a single selector. Optional. | |
22 changes: 19 additions & 3 deletions cli.js
@@ -42,6 +42,12 @@ const { argv } = yargs(hideBin(process.argv))
type: 'boolean',
default: DEFAULT_CRAWL,
})
.option('dontUseExportedSitemap', {
alias: 'X',
description: 'Force the Site Crawler to refetch links and ignore an existing sitemap',
type: 'boolean',
default: false,
})
.option('limit', {
alias: 'l',
description: 'how many pages to crawl',
@@ -96,6 +102,7 @@ const {
sitemap,
crawl,
limit,
dontUseExportedSitemap,
selector,
outputFileName,
takeScreenshots,
@@ -109,6 +116,7 @@ const selectorFinderConfig = {
sitemap,
crawl,
limit,
useExportedSitemap: !dontUseExportedSitemap,
selector,
outputFileName,
takeScreenshots,
@@ -175,7 +183,8 @@ async function main(config) {
${mainConfig.cssFile ? `| cssFile (${cssFile})` : ''}
${mainConfig.selector && !mainConfig.cssFile ? `| CSS Selector (${mainConfig.selector})` : ''}
${mainConfig.isSpa ? '| Handle as Single Page Application' : ''}
${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file and fetch a sitemap or recrawl'}
`;
await log
.toConsole(startMessage)
@@ -191,6 +200,7 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
{
startPage: mainConfig.sitemap,
shouldCrawl: mainConfig.crawl,
useExportedSitemap: mainConfig.useExportedSitemap,
},
);
await log.toConsole(`
@@ -199,9 +209,15 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
`);
await siteCrawler.produceSiteLinks();

if (!mainConfig.useExportedSitemap) {
await log.toConsole(`
||-> Site links exported to ${siteCrawler.exportFileName}
`);
} else {
await log.toConsole(`
||-> Site links read from ${siteCrawler.exportFileName}
`);
}

mainConfig.siteCrawler = siteCrawler;

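Note the double negative: the CLI exposes `--dontUseExportedSitemap`, but the crawler consumes the inverted `useExportedSitemap`. A minimal sketch of that mapping, with hypothetical argv values, to make the inversion explicit:

```JavaScript
// Sketch of the flag inversion in cli.js (values are hypothetical).
// Running: SelectorHound -u https://mysite.com/sitemap.xml -s ".title" -X
const dontUseExportedSitemap = true; // parsed from -X by yargs

const selectorFinderConfig = {
  sitemap: 'https://mysite.com/sitemap.xml',
  selector: '.title',
  useExportedSitemap: !dontUseExportedSitemap, // false: ignore any existing .sitemap.json
};
```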
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "selector-hound",
"version": "1.3.2",
"version": "1.4.0",
"description": "Find an element that matches a particular CSS selector on a website ",
"keywords": [
"CSS",
62 changes: 59 additions & 3 deletions src/site-crawler.js
@@ -1,4 +1,7 @@
/* eslint-disable max-len */
const axios = require('axios');
const fs = require('fs');
const Path = require('path');
const cheerio = require('cheerio');
const { Parser } = require('xml2js');

@@ -14,6 +17,7 @@ const DEFAULT_CONFIG = {
startPage: 'https://frankmtaylor.com',
linkSelector: 'a[href]',
shouldCrawl: false,
useExportedSitemap: true,
};

const DEFAULT_LIBRARIES = {
@@ -70,12 +74,36 @@ class SiteCrawler {
return linkArray;
}

/**
* @description provides a fully qualified path to the sitemap json file
* @type {string}
*/
get pathToExportedFile() {
return Path.join(process.cwd(), `${this.exportFileName}.${this.outputter.defaultOutputFile}`);
}

/**
* @description determines if the links have already been exported to a file
* @type {boolean}
*/
get hasExportedLinks() {
return fs.existsSync(this.pathToExportedFile);
}

/**
* adds multiple items to the linkSet property
* @param {string[]|object[]} linkArray an array of href values, or objects with a loc property
*/
addLinks(linkArray) {
const cleanArray = linkArray.map((link) => {
if (typeof link === 'string') {
return link;
}
if (typeof link === 'object' && link.loc) {
return link.loc;
}
// ignore anything that is neither a string nor a { loc } object
return null;
}).filter((link) => link !== null);
this.linkSet = new Set([...this.linkSet, ...cleanArray]);
}

/**
@@ -285,11 +313,39 @@ class SiteCrawler {
}
}

/**
* @description sets links from an existing json file
* @param {string} fileName
*/
async setLinksFromJsonFile(fileName) {
if (!fileName) return;
try {
const existingJson = await fs.promises.readFile(fileName, 'utf-8');
const existingSiteLinks = JSON.parse(existingJson);
this.addLinks(existingSiteLinks);
} catch (setLinksError) {
await this.errorToFileAsync(setLinksError);
}
}

/**
* @description wrapper for crawl and setSitemap that also produces export file
* @param {boolean} [shouldCrawl=this.config.shouldCrawl]
* @param {boolean} [useExportedSitemap=this.config.useExportedSitemap] use existing file if already exists
*/
async produceSiteLinks(
shouldCrawl = this.config.shouldCrawl,
useExportedSitemap = this.config.useExportedSitemap,
) {
const shouldNotProduceLinks = useExportedSitemap && this.hasExportedLinks;
if (shouldNotProduceLinks) {
const alreadyExistsMessage = `The file ${this.pathToExportedFile} already exists and recrawling was not forced.`;
await log.infoToFileAsync(alreadyExistsMessage);
await log.toConsole(alreadyExistsMessage);
await this.setLinksFromJsonFile(`${this.exportFileName}.${this.outputter.defaultOutputFile}`);
return;
}

if (shouldCrawl) {
await this.crawl();
} else {
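The crawler side can also be exercised programmatically. A minimal sketch under the assumption that the constructor merges this config with `DEFAULT_CONFIG`, as the tests below suggest; the `mysite.com` URLs are illustrative:

```JavaScript
// Sketch: driving SiteCrawler directly (illustrative URLs).
const SiteCrawler = require('./src/site-crawler');

const crawler = new SiteCrawler({
  startPage: 'https://mysite.com/sitemap.xml',
  useExportedSitemap: true, // default: reuse mysite.com.sitemap.json if it exists
});

(async () => {
  // addLinks accepts plain hrefs or sitemap-style { loc } objects
  crawler.addLinks(['https://mysite.com/path', { loc: 'https://mysite.com/another' }]);

  // Reads the exported file if present; otherwise fetches/crawls, then exports.
  await crawler.produceSiteLinks();
  console.log(crawler.linkSet.size);
})();
```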
36 changes: 35 additions & 1 deletion test/site-crawler.test.js
@@ -1,6 +1,6 @@
/* eslint-disable no-undef */
const axios = require('axios');

const fs = require('fs');
const SiteCrawler = require('../src/site-crawler');

jest.mock('axios');
@@ -124,6 +124,10 @@ axios.mockImplementation((url) => {
}
});

afterAll(async () => {
await fs.promises.unlink('frankmtaylor.com.sitemap.json');
});

describe('getting file', () => {
const siteCrawler = new SiteCrawler();
test('getFileAsync', async () => {
@@ -146,6 +150,8 @@ describe('SiteCrawler:Crawling', () => {
expect(SiteCrawler).toHaveProperty('defaultConfig');
expect(SiteCrawler.defaultConfig).toHaveProperty('startPage', 'https://frankmtaylor.com');
expect(SiteCrawler.defaultConfig).toHaveProperty('linkSelector', 'a[href]');
expect(SiteCrawler.defaultConfig).toHaveProperty('shouldCrawl', false);
expect(SiteCrawler.defaultConfig).toHaveProperty('useExportedSitemap', true);
});
});
describe('static getPageAsync', () => {
@@ -219,6 +225,25 @@
);
});
});
describe('getters', () => {
const siteCrawler = new SiteCrawler();

test('it has an origin', () => {
expect(siteCrawler.origin).toEqual('https://frankmtaylor.com');
});
test('it has a host', () => {
expect(siteCrawler.host).toEqual('frankmtaylor.com');
});
test('exportFileName', () => {
expect(siteCrawler.exportFileName).toEqual('frankmtaylor.com');
});
test('pathToExportedFile', () => {
expect(siteCrawler.pathToExportedFile).toEqual(`${process.cwd()}/frankmtaylor.com.sitemap.json`);
});
test('hasExportedLinks', () => {
expect(siteCrawler.hasExportedLinks).toEqual(false);
});
});
describe('getLinksFromPageAsync', () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com' }, { ajax: axios });
test('it gets relative links and ignores external links', async () => {
@@ -366,4 +391,13 @@ describe('SiteCrawler: Fetching Sitemap', () => {
expect(siteCrawler.urlset.length).toEqual(7);
});
});
describe('produceSiteLinks', () => {
test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
await siteCrawler.produceSiteLinks();
expect(siteCrawler.hasExportedLinks).toEqual(true);
expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
expect(siteCrawler.linkSet.has('http://frankmtaylor.com')).toEqual(true);
});
});
});
