From 5509f324e549f1a69eecef5cbb760f794466f255 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sat, 18 May 2024 23:16:11 +0800 Subject: [PATCH] feat: use wikipedia wrapper instead of axios Signed-off-by: Ruihang Xia --- .github/workflows/ci.yml | 4 ++-- package.json | 5 ++--- pnpm-lock.yaml | 43 +++++++++++++++++++++++++--------------- src/App.tsx | 3 --- src/fetch.test.tsx | 19 ++++-------------- src/fetch.tsx | 35 ++++++++++++++++---------------- src/list.tsx | 2 +- 7 files changed, 54 insertions(+), 57 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5473527..518efcf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [ '*' ] + branches: ['*'] pull_request: - branches: [ '*' ] + branches: ['*'] jobs: test: diff --git a/package.json b/package.json index aee1346..a23dc9b 100644 --- a/package.json +++ b/package.json @@ -11,11 +11,10 @@ "test": "vitest" }, "dependencies": { - "axios": "^1.6.8", "cheerio": "1.0.0-rc.12", - "htmlparser2": "^9.1.0", "react": "^18.3.1", - "react-dom": "^18.3.1" + "react-dom": "^18.3.1", + "wikipedia": "^2.1.2" }, "devDependencies": { "@antfu/eslint-config": "^2.16.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3327e96..208a2f6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,21 +8,18 @@ importers: .: dependencies: - axios: - specifier: ^1.6.8 - version: 1.6.8 cheerio: specifier: 1.0.0-rc.12 version: 1.0.0-rc.12 - htmlparser2: - specifier: ^9.1.0 - version: 9.1.0 react: specifier: ^18.3.1 version: 18.3.1 react-dom: specifier: ^18.3.1 version: 18.3.1(react@18.3.1) + wikipedia: + specifier: ^2.1.2 + version: 2.1.2 devDependencies: '@antfu/eslint-config': specifier: ^2.16.1 @@ -1048,6 +1045,10 @@ packages: resolution: {integrity: sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==} engines: {node: '>=6'} + camelcase@4.1.0: + resolution: {integrity: sha512-FxAv7HpHrXbh3aPo4o2qxHay2lkLY3x5Mw3KeE4KQE8ysVfziWeRZDwcjauvwBSGEC/nXUPzZy8zeh4HokqOnw==} + engines: {node: '>=4'} + caniuse-lite@1.0.30001616: resolution: {integrity: sha512-RHVYKov7IcdNjVHJFNY/78RdG4oGVjbayxv8u5IO74Wv7Hlq4PnJE6mo/OjFijjVFNy5ijnCt6H3IIo4t+wfEw==} @@ -1732,9 +1733,6 @@ packages: htmlparser2@8.0.2: resolution: {integrity: sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==} - htmlparser2@9.1.0: - resolution: {integrity: sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==} - http-proxy-agent@7.0.2: resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==} engines: {node: '>= 14'} @@ -1770,6 +1768,9 @@ packages: inflight@1.0.6: resolution: {integrity: sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==} + infobox-parser@3.6.4: + resolution: {integrity: sha512-d2lTlxKZX7WsYxk9/UPt51nkmZv5tbC75SSw4hfHqZ3LpRAn6ug0oru9xI2X+S78va3aUAze3xl/UqMuwLmJUw==} + inherits@2.0.4: resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} @@ -2797,6 +2798,10 @@ packages: engines: {node: '>=8'} hasBin: true + wikipedia@2.1.2: + resolution: {integrity: sha512-RAYaMpXC9/E873RaSEtlEa8dXK4e0p5k98GKOd210MtkE5emm6fcnwD+N6ZA4cuffjDWagvhaQKtp/mGp2BOVQ==} + engines: {node: '>=10'} + word-wrap@1.2.5: resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==} engines: {node: '>=0.10.0'} @@ -4010,6 +4015,8 @@ snapshots: callsites@3.1.0: {} + camelcase@4.1.0: {} + caniuse-lite@1.0.30001616: {} chai@4.4.1: @@ -4943,13 +4950,6 @@ snapshots: domutils: 3.1.0 entities: 4.5.0 - htmlparser2@9.1.0: - dependencies: - domelementtype: 2.3.0 - domhandler: 5.0.3 - domutils: 3.1.0 - entities: 4.5.0 - http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.1 @@ -4986,6 +4986,10 @@ snapshots: once: 1.4.0 wrappy: 1.0.2 + infobox-parser@3.6.4: + dependencies: + camelcase: 4.1.0 + inherits@2.0.4: {} internal-slot@1.0.7: @@ -6078,6 +6082,13 @@ snapshots: siginfo: 2.0.0 stackback: 0.0.2 + wikipedia@2.1.2: + dependencies: + axios: 1.6.8 + infobox-parser: 3.6.4 + transitivePeerDependencies: + - debug + word-wrap@1.2.5: {} wrap-ansi@7.0.0: diff --git a/src/App.tsx b/src/App.tsx index 7becaa4..cdb9364 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -2,9 +2,6 @@ import { useEffect, useState } from 'react' import { build_list } from './list' function App() { - // const [list, setList] = useState([]) - // build_list(100, 2000).then(setList) - const [list, setList] = useState<{ from: number, to: number, person: { desc: string, link: string | undefined, death: number | undefined }, other_people: { desc: string, link: string | undefined, death: number | undefined }[] }[]>([]) useEffect(() => { diff --git a/src/fetch.test.tsx b/src/fetch.test.tsx index 4032a47..84a83b9 100644 --- a/src/fetch.test.tsx +++ b/src/fetch.test.tsx @@ -1,10 +1,8 @@ -import nock from 'nock' import { describe, expect, it } from 'vitest' -import { fetch_people_in } from './fetch' +import { extract_people_from_html } from './fetch' -describe('fetch_people_in', () => { - it('call fetch_people_in with 1990', async () => { - const year = 1990 +describe('extract_people_from_html', () => { + it('call extract_people_from_html with 100', async () => { // scraped from https://en.wikipedia.org/wiki/AD_100 const mockData = `
@@ -29,16 +27,7 @@ describe('fetch_people_in', () => {
  • Wang Chong, Chinese philosopher (b. AD 27)
  • ` - - nock('https://en.wikipedia.org') - .defaultReplyHeaders({ - 'access-control-allow-origin': '*', - 'access-control-allow-credentials': 'true', - }) - .get(`/wiki/AD_${year}`) - .reply(200, mockData) - - const result = await fetch_people_in(year) + const result = extract_people_from_html(mockData) expect(result).toEqual( [{ diff --git a/src/fetch.tsx b/src/fetch.tsx index e88be38..2cc826b 100644 --- a/src/fetch.tsx +++ b/src/fetch.tsx @@ -1,21 +1,22 @@ -import axios from 'axios' import * as cheerio from 'cheerio' - -const WIKIPEDIA_AD_URL = 'https://en.wikipedia.org/wiki/AD_' -// const WIKIPEDIA_AD_URL = 'https://cors-anywhere.herokuapp.com/https://en.wikipedia.org/wiki/AD_' +import wiki from 'wikipedia' export async function fetch_people_in(year: number) { - return axios.get(WIKIPEDIA_AD_URL + year).then((response) => { - const $ = cheerio.load(response.data) - const birthsHeader = $('#Births').parent() - const births = birthsHeader.nextUntil('h2').find('li').map((_, el) => { - const desc = $(el).text() - const link = $(el).find('a').attr('href') - const deathMatch = desc.match(/\(d\. \d+/i)?.[0] - const death = deathMatch ? Number.parseInt(deathMatch.substring(3)) : undefined - return { desc, link, death } - }, - ).get() - return births - }) + const page = await wiki.page(`AD_${year}`) + const html = await page.html({ redirect: true }) + return extract_people_from_html(html) +} + +export function extract_people_from_html(html: string) { + const $ = cheerio.load(html) + const birthsHeader = $('#Births').parent() + const births = birthsHeader.nextUntil('h2').find('li').map((_, el) => { + const desc = $(el).text() + const link = $(el).find('a').attr('href') + const deathMatch = desc.match(/\(d\. \d+/i)?.[0] + const death = deathMatch ? Number.parseInt(deathMatch.substring(3)) : undefined + return { desc, link, death } + }).get() + + return births } diff --git a/src/list.tsx b/src/list.tsx index 74fea90..05feb2a 100644 --- a/src/list.tsx +++ b/src/list.tsx @@ -27,7 +27,7 @@ export async function build_list(start_year: number, end_year: number) { other_people: people_born, }) - year = random_person.death! + year = random_person.death! + 500 } return list