Skip to content

Commit

Permalink
put homepage
Browse files Browse the repository at this point in the history
  • Loading branch information
yindaheng98 committed Mar 20, 2024
1 parent a7f7155 commit 908c7e1
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
13 changes: 11 additions & 2 deletions citation_crawler/crawlers/ss.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ def name(self) -> Optional[str]:
def dblp_pid(self) -> Optional[str]:
# Unconditionally None: this accessor never supplies a DBLP pid.
return None

def homepage(self) -> Optional[str]:
    """Return the ``'homepage'`` entry of ``self.data``, if there is one.

    Returns:
        The value stored under ``'homepage'``, or ``None`` when the key is
        absent from ``self.data``.
    """
    if 'homepage' in self.data:
        return self.data['homepage']
    # Explicit rather than falling off the end, so every exit path of the
    # function visibly returns a value.
    return None

def dblp_name(self) -> Optional[List[str]]:
if 'externalIds' in self.data and self.data['externalIds'] and 'DBLP' in self.data['externalIds']:
return self.data['externalIds']['DBLP']
Expand Down Expand Up @@ -54,7 +58,7 @@ def list_data_is_valid(text):
return json.loads(text)


# Comma-separated author field list; presumably the fields requested from the
# Semantic Scholar API -- confirm against the request code.
# NOTE(review): these two assignments are the before/after lines of the diff;
# executed in order, the second (with "homepage") is the value that remains.
fields_authors = "externalIds,name,affiliations"
fields_authors = "externalIds,name,affiliations,homepage"
# Derived from the field list with commas replaced by dashes, so changing the
# fields also changes the root_authors value.
root_authors = f"semanticscholar/authors--{fields_authors.replace(',', '-')}"


Expand Down Expand Up @@ -113,7 +117,7 @@ async def _get_authors_from_author_data(self) -> Iterable[Author]:
for author in self.author_data:
yield author

async def authors(self) -> Iterable[Author]:
async def authors(self) -> Iterable[SSAuthor]:
if 'authors' in self.data and len(self.data['authors']) >= 0:
for a in self.data['authors']:
if 'authorId' not in a or 'externalIds' not in a or not a['externalIds']:
Expand Down Expand Up @@ -254,7 +258,10 @@ async def match_authors(self, paper: SSPaper, authors: AsyncIterable[Dict]) -> A
async for author in authors:
if 'authorId' in author:
if author['authorId'] in authorIds:
ss_author = dblp_names[author['name']]
write_fields = {}
if ss_author.homepage():
write_fields["homepage"] = ss_author.homepage()
author_kv = {"authorId": author['authorId']}
yield author_kv, write_fields, None
else: # if there is an author in database but is not really an author
Expand All @@ -267,5 +274,7 @@ async def match_authors(self, paper: SSPaper, authors: AsyncIterable[Dict]) -> A
elif 'name' in author and author['name'] in dblp_names:
ss_author = dblp_names[author['name']]
write_fields = {"authorId": ss_author.authorId()}
if ss_author.homepage():
write_fields["homepage"] = ss_author.homepage()
author_kv = {"dblp_pid": author["dblp_pid"]}
yield author_kv, write_fields, None
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

setup(
name='citation_crawler',
version='2.7',
version='2.8',
author='yindaheng98',
author_email='[email protected]',
url='https://github.com/yindaheng98/citation-crawler',
Expand Down

0 comments on commit 908c7e1

Please sign in to comment.