From 908c7e1cf97af04cf20fe44fd91cb67186f42908 Mon Sep 17 00:00:00 2001
From: yindaheng98
Date: Wed, 20 Mar 2024 15:46:57 -0700
Subject: [PATCH] put homepage

---
Review notes (this area is ignored when the patch is applied):

* Line structure was restored from a whitespace-collapsed copy of this patch.
  Leading indentation and blank context lines are inferred from the Python
  block structure and the hunk line counts -- confirm against the repository
  (or apply with --ignore-whitespace) before relying on them.
* SSAuthor.homepage() has no explicit fallback: when 'homepage' is missing
  from self.data it returns None only by falling off the end of the method.
  `return self.data.get('homepage')` would state that intent directly.
* match_authors, `authorId` branch: the new `dblp_names[author['name']]`
  lookup is not guarded by the `'name' in author and author['name'] in
  dblp_names` test that the `elif` branch uses, so it looks like it can raise
  KeyError for an author matched by authorId only. TODO confirm how
  dblp_names is populated; it is not visible in this patch.
* Both match_authors branches call ss_author.homepage() twice; binding the
  result to a local once would be enough.
* Context only (not changed here): `len(self.data['authors']) >= 0` in
  authors() is always true; `> 0` was presumably intended.

 citation_crawler/crawlers/ss.py | 13 +++++++++++--
 setup.py                        |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/citation_crawler/crawlers/ss.py b/citation_crawler/crawlers/ss.py
index 917dd27..856dd89 100644
--- a/citation_crawler/crawlers/ss.py
+++ b/citation_crawler/crawlers/ss.py
@@ -27,6 +27,10 @@ def name(self) -> Optional[str]:
     def dblp_pid(self) -> Optional[str]:
         return None
 
+    def homepage(self) -> Optional[str]:
+        if 'homepage' in self.data:
+            return self.data['homepage']
+
     def dblp_name(self) -> Optional[List[str]]:
         if 'externalIds' in self.data and self.data['externalIds'] and 'DBLP' in self.data['externalIds']:
             return self.data['externalIds']['DBLP']
@@ -54,7 +58,7 @@ def list_data_is_valid(text):
     return json.loads(text)
 
 
-fields_authors = "externalIds,name,affiliations"
+fields_authors = "externalIds,name,affiliations,homepage"
 root_authors = f"semanticscholar/authors--{fields_authors.replace(',', '-')}"
 
 
@@ -113,7 +117,7 @@ async def _get_authors_from_author_data(self) -> Iterable[Author]:
         for author in self.author_data:
             yield author
 
-    async def authors(self) -> Iterable[Author]:
+    async def authors(self) -> Iterable[SSAuthor]:
         if 'authors' in self.data and len(self.data['authors']) >= 0:
             for a in self.data['authors']:
                 if 'authorId' not in a or 'externalIds' not in a or not a['externalIds']:
@@ -254,7 +258,10 @@ async def match_authors(self, paper: SSPaper, authors: AsyncIterable[Dict]) -> A
         async for author in authors:
             if 'authorId' in author:
                 if author['authorId'] in authorIds:
+                    ss_author = dblp_names[author['name']]
                     write_fields = {}
+                    if ss_author.homepage():
+                        write_fields["homepage"] = ss_author.homepage()
                     author_kv = {"authorId": author['authorId']}
                     yield author_kv, write_fields, None
                 else:  # if there is an author in database but is not really an author
@@ -267,5 +274,7 @@ async def match_authors(self, paper: SSPaper, authors: AsyncIterable[Dict]) -> A
             elif 'name' in author and author['name'] in dblp_names:
                 ss_author = dblp_names[author['name']]
                 write_fields = {"authorId": ss_author.authorId()}
+                if ss_author.homepage():
+                    write_fields["homepage"] = ss_author.homepage()
                 author_kv = {"dblp_pid": author["dblp_pid"]}
                 yield author_kv, write_fields, None
diff --git a/setup.py b/setup.py
index 96e07b6..1f9ec9b 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 
 setup(
     name='citation_crawler',
-    version='2.7',
+    version='2.8',
     author='yindaheng98',
     author_email='yindaheng98@gmail.com',
     url='https://github.com/yindaheng98/citation-crawler',