Merge pull request #13 from paceaux/develop
Develop
paceaux authored Sep 6, 2022
2 parents e9245f6 + 5626c40 commit c5d0d3e
Showing 5 changed files with 158 additions and 14 deletions.
50 changes: 44 additions & 6 deletions README.md
@@ -34,34 +34,72 @@ npm i -g selector-hound

## Usage

### Basic Scanning
Only scan the first 20 pages for `.yourthing`

```shell
SelectorHound --sitemap=https://wherever.com/xml --limit=20 --selector=".yourthing"
SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing"
```

### Re-using, regenerating, and providing a list of links
Before scanning begins, SelectorHound generates a `<site>.sitemap.json` file containing all of the links it will scan. This file is built from the `sitemap.xml` you provide **or** by crawling the site for links. To improve performance, SelectorHound looks for this file _first_ before attempting to retrieve or generate a sitemap.

If you want to re-generate this `<site>.sitemap.json` file, you can force it:

```
SelectorHound --sitemap=https://wherever.com/xml --selector=".yourthing" --dontUseExportedSitemap
SelectorHound -u https://mysite.com/landing -r -s '.yourThing' -X
```

#### Formatting
By default, SelectorHound generates a file modeled on sitemap XML: an array of objects with a `loc` property:
```JavaScript
[
{
'loc': 'https://mysite.com/path'
},
{
'loc': 'https://mysite.com/another'
}
]
```

However, you can also provide your own list of links as just an array of strings:
```JavaScript
[
"https://mysite.com/path",
"https://mysite.com/another"
]
```
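
If you build this list yourself, a short Node script can write it in either accepted shape. This is a minimal sketch, not part of SelectorHound itself; the `mysite.com.sitemap.json` file name is hypothetical and follows the `<site>.sitemap.json` convention described above, which SelectorHound resolves from the current working directory:

```JavaScript
// Minimal sketch: write a custom link list for SelectorHound to pick up.
// The file name is hypothetical; it follows the <site>.sitemap.json convention.
const fs = require('fs');

const links = [
  'https://mysite.com/path',
  'https://mysite.com/another',
];

fs.writeFileSync('mysite.com.sitemap.json', JSON.stringify(links, null, 2));
```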


### Crawling instead of using a sitemap
Crawl the site, starting from a landing page.

```shell
SelectorHound -u https://mysite.com/landing -r -s ".myClass"
```

### Taking Screenshots or dealing with SPAs
Scan the first 20 pages and take screenshots
```shell
SelectorHound -u https://wherever.com/xml -l 20 -s ".yourthing" -c
```

Scan those pages, but treat them like Single Page Applications (`-d`), and search for all the selectors in `mystyles.css`
```shell
SelectorHound -u https://wherever.com/xml -f "mystyles.css" -d
```

### Options

| Option | Alias | Description | Defaults |
|---|---|---|---|
| `--sitemap` |`-u` | A fully qualified URL to an XML sitemap, **or** a fully qualified URL to a page **if** `crawl` is `true`. Required. | `https://frankmtaylor.com/sitemap.xml` |
| `--dontUseExportedSitemap` |`-X` | If a `<site>.sitemap.json` file has already been created, ignore it and generate a new one. Optional. | `false` |
| `--limit` | `-l` | Maximum number of pages to crawl. Optional. | `0` |
| `--selector` | `-s` | A valid CSS selector. Required. | `.title` |
| `--cssFile` | `-f` | A CSS file to use instead of a single selector. Optional. | |
22 changes: 19 additions & 3 deletions cli.js
@@ -42,6 +42,12 @@ const { argv } = yargs(hideBin(process.argv))
type: 'boolean',
default: DEFAULT_CRAWL,
})
.option('dontUseExportedSitemap', {
alias: 'X',
description: 'Force the Site Crawler to refetch links and ignore an existing sitemap',
type: 'boolean',
default: false,
})
.option('limit', {
alias: 'l',
description: 'how many pages to crawl',
@@ -96,6 +102,7 @@ const {
sitemap,
crawl,
limit,
dontUseExportedSitemap,
selector,
outputFileName,
takeScreenshots,
@@ -109,6 +116,7 @@ const selectorFinderConfig = {
sitemap,
crawl,
limit,
useExportedSitemap: !dontUseExportedSitemap,
selector,
outputFileName,
takeScreenshots,
@@ -175,7 +183,8 @@ async function main(config) {
${mainConfig.cssFile ? `| cssFile (${cssFile})` : ''}
${mainConfig.selector && !mainConfig.cssFile ? `| CSS Selector (${mainConfig.selector})` : ''}
${mainConfig.isSpa ? '| Handle as Single Page Application' : ''}
${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
${mainConfig.useExportedSitemap ? '' : '| Ignore any existing .sitemap.json file and fetch a sitemap or recrawl'}
`;
await log
.toConsole(startMessage)
@@ -191,6 +200,7 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
{
startPage: mainConfig.sitemap,
shouldCrawl: mainConfig.crawl,
useExportedSitemap: mainConfig.useExportedSitemap,
},
);
await log.toConsole(`
@@ -199,9 +209,15 @@ ${mainConfig.takeScreenshots ? '| Take Screenshots' : ''}
`);
await siteCrawler.produceSiteLinks();

if (!mainConfig.useExportedSitemap) {
await log.toConsole(`
||-> Site links exported to ${siteCrawler.exportFileName}
`);
} else {
await log.toConsole(`
||-> Site links read from ${siteCrawler.exportFileName}
`);
}

mainConfig.siteCrawler = siteCrawler;

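Note the double negative: the CLI exposes `--dontUseExportedSitemap`, but the crawler consumes the inverted `useExportedSitemap`. A minimal sketch of that mapping, with hypothetical argv values, to make the inversion explicit:

```JavaScript
// Sketch of the flag inversion in cli.js (values are hypothetical).
// Running: SelectorHound -u https://mysite.com/sitemap.xml -s ".title" -X
const dontUseExportedSitemap = true; // parsed from -X by yargs

const selectorFinderConfig = {
  sitemap: 'https://mysite.com/sitemap.xml',
  selector: '.title',
  useExportedSitemap: !dontUseExportedSitemap, // false: ignore any existing .sitemap.json
};
```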
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "selector-hound",
"version": "1.3.2",
"version": "1.4.0",
"description": "Find an element that matches a particular CSS selector on a website ",
"keywords": [
"CSS",
62 changes: 59 additions & 3 deletions src/site-crawler.js
@@ -1,4 +1,7 @@
/* eslint-disable max-len */
const axios = require('axios');
const fs = require('fs');
const Path = require('path');
const cheerio = require('cheerio');
const { Parser } = require('xml2js');

@@ -14,6 +17,7 @@ const DEFAULT_CONFIG = {
startPage: 'https://frankmtaylor.com',
linkSelector: 'a[href]',
shouldCrawl: false,
useExportedSitemap: true,
};

const DEFAULT_LIBRARIES = {
@@ -70,12 +74,36 @@ class SiteCrawler {
return linkArray;
}

/**
* @description provides a fully qualified path to the sitemap json file
* @type {string}
*/
get pathToExportedFile() {
return Path.join(process.cwd(), `${this.exportFileName}.${this.outputter.defaultOutputFile}`);
}

/**
* @description determines if the links have already been exported to a file
* @type {boolean}
*/
get hasExportedLinks() {
return fs.existsSync(this.pathToExportedFile);
}

/**
* adds multiple items to the linkSet property
* @param {string[]|object[]} linkArray an array of href values, or objects with a loc property
*/
addLinks(linkArray) {
const cleanArray = linkArray.map((link) => {
if (typeof link === 'string') {
return link;
}
if (typeof link === 'object' && link.loc) {
return link.loc;
}
// ignore anything that is neither a string nor a { loc } object
return null;
}).filter((link) => link !== null);
this.linkSet = new Set([...this.linkSet, ...cleanArray]);
}

/**
@@ -285,11 +313,39 @@ class SiteCrawler {
}
}

/**
* @description sets links from an existing json file
* @param {string} fileName
*/
async setLinksFromJsonFile(fileName) {
if (!fileName) return;
try {
const existingJson = await fs.promises.readFile(fileName, 'utf-8');
const existingSiteLinks = JSON.parse(existingJson);
this.addLinks(existingSiteLinks);
} catch (setLinksError) {
await this.errorToFileAsync(setLinksError);
}
}

/**
* @description wrapper for crawl and setSitemap that also produces export file
* @param {boolean} [shouldCrawl=this.config.shouldCrawl]
* @param {boolean} [useExportedSitemap=this.config.useExportedSitemap] use existing file if already exists
*/
async produceSiteLinks(
shouldCrawl = this.config.shouldCrawl,
useExportedSitemap = this.config.useExportedSitemap,
) {
const shouldNotProduceLinks = useExportedSitemap && this.hasExportedLinks;
if (shouldNotProduceLinks) {
const alreadyExistsMessage = `The file ${this.pathToExportedFile} already exists and recrawling was not forced.`;
await log.infoToFileAsync(alreadyExistsMessage);
await log.toConsole(alreadyExistsMessage);
await this.setLinksFromJsonFile(`${this.exportFileName}.${this.outputter.defaultOutputFile}`);
return;
}

if (shouldCrawl) {
await this.crawl();
} else {
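The crawler side can also be exercised programmatically. A minimal sketch under the assumption that the constructor merges this config with `DEFAULT_CONFIG`, as the tests below suggest; the `mysite.com` URLs are illustrative:

```JavaScript
// Sketch: driving SiteCrawler directly (illustrative URLs).
const SiteCrawler = require('./src/site-crawler');

const crawler = new SiteCrawler({
  startPage: 'https://mysite.com/sitemap.xml',
  useExportedSitemap: true, // default: reuse mysite.com.sitemap.json if it exists
});

(async () => {
  // addLinks accepts plain hrefs or sitemap-style { loc } objects
  crawler.addLinks(['https://mysite.com/path', { loc: 'https://mysite.com/another' }]);

  // Reads the exported file if present; otherwise fetches/crawls, then exports.
  await crawler.produceSiteLinks();
  console.log(crawler.linkSet.size);
})();
```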
36 changes: 35 additions & 1 deletion test/site-crawler.test.js
@@ -1,6 +1,6 @@
/* eslint-disable no-undef */
const axios = require('axios');

const fs = require('fs');
const SiteCrawler = require('../src/site-crawler');

jest.mock('axios');
@@ -124,6 +124,10 @@ axios.mockImplementation((url) => {
}
});

afterAll(async () => {
await fs.promises.unlink('frankmtaylor.com.sitemap.json');
});

describe('getting file', () => {
const siteCrawler = new SiteCrawler();
test('getFileAsync', async () => {
@@ -146,6 +150,8 @@ describe('SiteCrawler:Crawling', () => {
expect(SiteCrawler).toHaveProperty('defaultConfig');
expect(SiteCrawler.defaultConfig).toHaveProperty('startPage', 'https://frankmtaylor.com');
expect(SiteCrawler.defaultConfig).toHaveProperty('linkSelector', 'a[href]');
expect(SiteCrawler.defaultConfig).toHaveProperty('shouldCrawl', false);
expect(SiteCrawler.defaultConfig).toHaveProperty('useExportedSitemap', true);
});
});
describe('static getPageAsync', () => {
@@ -219,6 +225,25 @@
);
});
});
describe('getters', () => {
const siteCrawler = new SiteCrawler();

test('it has an origin', () => {
expect(siteCrawler.origin).toEqual('https://frankmtaylor.com');
});
test('it has a host', () => {
expect(siteCrawler.host).toEqual('frankmtaylor.com');
});
test('exportFileName', () => {
expect(siteCrawler.exportFileName).toEqual('frankmtaylor.com');
});
test('pathToExportedFile', () => {
expect(siteCrawler.pathToExportedFile).toEqual(`${process.cwd()}/frankmtaylor.com.sitemap.json`);
});
test('hasExportedLinks', () => {
expect(siteCrawler.hasExportedLinks).toEqual(false);
});
});
describe('getLinksFromPageAsync', () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com' }, { ajax: axios });
test('it gets relative links and ignores external links', async () => {
@@ -366,4 +391,13 @@ describe('SiteCrawler: Fetching Sitemap', () => {
expect(siteCrawler.urlset.length).toEqual(7);
});
});
describe('produceSiteLinks', () => {
test('when produceSiteLinks is run, a file is created and it knows it, and still has data', async () => {
const siteCrawler = new SiteCrawler({ startPage: 'https://frankmtaylor.com/sitemap.xml' });
await siteCrawler.produceSiteLinks();
expect(siteCrawler.hasExportedLinks).toEqual(true);
expect(siteCrawler.linkSet.size).toBeGreaterThan(0);
expect(siteCrawler.linkSet.has('http://frankmtaylor.com')).toEqual(true);
});
});
});
