Skip to content

Commit

Permalink
Merge pull request #205 from ndaidong/v5.0.1
Browse files Browse the repository at this point in the history
v5.0.1
  • Loading branch information
ndaidong authored Dec 25, 2021
2 parents 9341737 + a4ddfc4 commit adbef9e
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 21 deletions.
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,37 @@ addQueryRules([
extract('https://bad-website.domain/page/article')
````

While adding rules, you can specify a `transform()` function to fine-tune article content more thoroughly.

Example rule with transformation:

```js
const { addQueryRules } = require('article-parser')

addQueryRules([
{
patterns: [
/http(s?):\/\/bad-website.domain\/*/
],
selector: '#article_id_here',
transform: ($) => {
// here $ is cheerio's DOM instance, which contains the article content
// so you can do everything cheerio supports
// for example, here we replace all <h1></h1> with <b></b>
$('h1').replaceWith(function () {
const h1Html = $(this).html()
return `<b>${h1Html}</b>`
})
// at the end, you must return $
return $
}
}
])
```

Please refer to [cheerio's docs](https://cheerio.js.org/) for more info.


#### Configuration methods

In addition, this lib provides some methods to customize default settings. Don't touch them unless you have a reason to do so.
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "5.0.0",
"version": "5.0.1",
"name": "article-parser",
"description": "To extract main article from given URL",
"homepage": "https://ndaidong.github.io/article-parser-demo/",
Expand Down
16 changes: 11 additions & 5 deletions src/utils/findRulesByUrl.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,22 @@ const isValidUrl = require('./isValidUrl')

const { getQueryRules } = require('../config')

const findRulesByUrl = (url) => {
const findRulesByUrl = (urls = []) => {
const rules = getQueryRules()
const matches = !isValidUrl(url)
? []
: rules.filter(({ patterns = [] }) => {
const xurls = urls.filter(isValidUrl)
for (let i = rules.length - 1; i >= 0; i--) {
const rule = rules[i]
const { patterns } = rule
const matched = xurls.some((url) => {
return patterns.some((pattern) => {
return pattern.test(url)
})
})
return matches.length > 0 ? matches[0] : {}
if (matched) {
return rule
}
}
return {}
}

module.exports = findRulesByUrl
16 changes: 6 additions & 10 deletions src/utils/findRulesByUrl.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,19 @@ const findRulesByUrl = require('./findRulesByUrl')
describe('test findRulesByUrl()', () => {
const entries = [
{
url: '',
urls: [{}, ''],
expectation: {}
},
{
url: {},
expectation: {}
},
{
url: 'https://vietnamnet.vn/path/to/article',
urls: [1209, 'https://vietnamnet.vn/path/to/article'],
expectation: (result, expect) => {
expect(result).toBeTruthy()
expect(result).toEqual(expect.objectContaining({ selector: '#ArticleContent' }))
expect(result.selector).toEqual('#ArticleContent')
}
},
{
url: 'https://vnn.vn/path/to/article',
urls: ['https://vnn.vn/path/to/article'],
expectation: (result, expect) => {
expect(result).toBeTruthy()
expect(result).toEqual(expect.objectContaining({ selector: '#ArticleContent' }))
Expand All @@ -34,11 +30,11 @@ describe('test findRulesByUrl()', () => {
]
entries.forEach((entry) => {
const {
url,
urls,
expectation
} = entry
test(`check if findRulesByUrl("${url}") works correctly`, () => {
const result = findRulesByUrl(url)
test('check if findRulesByUrl() works correctly', () => {
const result = findRulesByUrl(urls)
if (isFunction(expectation)) {
expectation(result, expect)
} else {
Expand Down
7 changes: 4 additions & 3 deletions src/utils/parseFromHtml.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@ const parseHtml = async (rawhtml, inputUrl = '') => {
// get defined selector
const {
selector = null,
unwanted = []
} = findRulesByUrl(bestUrl)
unwanted = [],
transform = null
} = findRulesByUrl(links)

// find article content
const mainContent = extractWithSelector(html, selector, unwanted)
Expand All @@ -99,7 +100,7 @@ const parseHtml = async (rawhtml, inputUrl = '') => {
contentLengthThreshold
} = getParserOptions()

const normalizedContent = await standalizeArticle(content, bestUrl)
const normalizedContent = await standalizeArticle(content, bestUrl, transform)

const textContent = stripTags(normalizedContent)
if (textContent.length < contentLengthThreshold) {
Expand Down
28 changes: 28 additions & 0 deletions src/utils/parseFromHtml.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ const {

const { isFunction } = require('bellajs')

const {
addQueryRules
} = require('../config')

const parseFromHtml = require('./parseFromHtml')

describe('test parseFromHtml()', () => {
Expand Down Expand Up @@ -101,3 +105,27 @@ describe('test parseFromHtml()', () => {
})
})
})

test('check if parseFromHtml() works with transform rule', async () => {
addQueryRules([
{
patterns: [
/http(s?):\/\/([\w]+.)?need-transform.tld\/*/
],
transform: ($) => {
$('a').replaceWith(function () {
const sHtml = $(this).html()
const link = $(this).attr('href')
return `[link url="${link}"]${sHtml}[/link]`
})
return $
}
}
])
const html = readFileSync('./test-data/vnn-article.html', 'utf8')
const url = 'https://need-transform.tld/path/to/article'
const result = await parseFromHtml(html, url)
expect(result.title).toEqual('Article title here')
expect(result.content).toEqual(expect.not.stringContaining('<a href="https://vnn.vn/dict/watermelon" target="_blank">'))
expect(result.content).toEqual(expect.stringContaining('[link url="https://vnn.vn/dict/watermelon"]watermelon[/link]'))
})
5 changes: 3 additions & 2 deletions src/utils/standalizeArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const absolutifyUrl = require('./absolutifyUrl')

const { getSanitizeHtmlOptions } = require('../config')

module.exports = async (htmlArticle, url) => {
module.exports = async (htmlArticle, url, transform = null) => {
const $ = cheerio.load(htmlArticle, {
normalizeWhitespace: true,
decodeEntities: true
Expand All @@ -30,7 +30,8 @@ module.exports = async (htmlArticle, url) => {
}
})

const minifiedHtml = await htmlmin($.html(), {
const html = transform ? transform($).html() : $.html()
const minifiedHtml = await htmlmin(html, {
removeComments: true,
removeEmptyElements: true,
removeEmptyAttributes: true,
Expand Down

0 comments on commit adbef9e

Please sign in to comment.