From 4cde227f33be1a40ede399458f7407a5a8106a1b Mon Sep 17 00:00:00 2001 From: yindaheng98 Date: Fri, 17 May 2024 20:54:39 -0700 Subject: [PATCH] chinese papers --- citation_crawler/graph.py | 5 ++--- citation_crawler/items.py | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/citation_crawler/graph.py b/citation_crawler/graph.py index 954cc5d..45f7805 100644 --- a/citation_crawler/graph.py +++ b/citation_crawler/graph.py @@ -1,8 +1,7 @@ import abc import logging -import asyncio from tqdm.asyncio import tqdm -from typing import Tuple, Optional, AsyncIterable, List +from typing import Tuple, Optional, AsyncIterable, List, Dict import random from dblp_crawler.gather import gather from .items import Paper @@ -150,7 +149,7 @@ async def _init_papers(self): async for paper, news in tqdm(gather(*tasks), desc="Writing init papers", total=len(tasks)): yield paper, news - async def _bfs_once(self) -> int: + async def _bfs_once(self): # 初始化 if not self.inited: async for paper, news in self._init_papers(): diff --git a/citation_crawler/items.py b/citation_crawler/items.py index 5dce041..76da97f 100644 --- a/citation_crawler/items.py +++ b/citation_crawler/items.py @@ -44,7 +44,7 @@ def title(self) -> str: return None def title_hash(self) -> str: - return re.sub(r"[^0-9a-z]", "", self.title().lower()) + return re.sub(r"[^0-9a-z\u4e00-\u9fa5]", "", self.title().lower()) @abc.abstractmethod def year(self) -> Optional[int]: diff --git a/setup.py b/setup.py index f210e28..ee720ac 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup( name='citation_crawler', - version='2.10.1', + version='2.10.2', author='yindaheng98', author_email='yindaheng98@gmail.com', url='https://github.com/yindaheng98/citation-crawler',