Skip to content

Commit

Permalink
--no-skip-exists
Browse files Browse the repository at this point in the history
  • Loading branch information
yindaheng98 committed Apr 22, 2024
1 parent fd8cc68 commit dc4444b
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 9 deletions.
3 changes: 2 additions & 1 deletion citation_crawler/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ async def filter_papers(self, papers):
# Options registered on the neo4j parser. The help texts are plain string
# literals: none of them interpolates anything, so no f-prefix is needed.
parser_n4j.add_argument("--username", type=str, default=None, help='Auth username to neo4j database.')
parser_n4j.add_argument("--password", type=str, default=None, help='Auth password to neo4j database.')
parser_n4j.add_argument("--uri", type=str, required=True, help='URI to neo4j database.')
# store_true flag: args.no_skip_exists is False unless the option is given.
parser_n4j.add_argument("--no-skip-exists", action="store_true", help='Do not skip exists references. Use it when you want to rewrite all papers.')


async def func_parser_n4j_async(parser):
Expand All @@ -121,7 +122,7 @@ async def func_parser_n4j_async(parser):
logger.info(f"Specified uri and auth: {args.uri} {args.username} {'******' if args.password else 'none'}")
async with AsyncGraphDatabase.driver(args.uri, auth=(args.username, args.password)) as driver:
async with driver.session() as session:
summarizer = DefaultNeo4jSummarizer(session)
summarizer = DefaultNeo4jSummarizer(session, not args.no_skip_exists)
crawler = DefaultSemanticScholarCrawler(
year, keywords,
aid_list,
Expand Down
15 changes: 8 additions & 7 deletions citation_crawler/summarizers/neo4j.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,41 +90,42 @@ async def divide_author(tx, paper: Paper, author_kv, write_fields, division_kv):
title_hash=paper.title_hash(), **write_fields)


async def _add_references(tx, paper: Paper, skip_exists=True):
    """Write every reference of *paper* and the edge linking it to *paper*.

    Args:
        tx: Transaction the Cypher statement is run on; also handed to
            ``add_paper`` and ``add_reference``.
        paper: Paper whose references are written.
        skip_exists: When true, a reference whose title hash is returned by
            the lookup query is skipped. When false, every reference yielded
            by ``paper.get_references()`` is written again.
    """
    title_hash_exists = set()
    if skip_exists:
        # The lookup result only feeds the skip test below, so the query is
        # not issued at all when nothing is going to be skipped.
        # NOTE(review): this pattern matches nodes that have a CITE edge
        # pointing *to* `paper`; confirm against add_reference that this is
        # the direction stored for references.
        result = await tx.run(
            "MATCH (a:Publication)-[:CITE]->(p:Publication {title_hash: $title_hash}) RETURN a.title_hash",
            title_hash=paper.title_hash())
        title_hash_exists = {title_hash for (title_hash,) in await result.values()}
    async for ref in paper.get_references():
        if ref.title_hash() in title_hash_exists:
            continue
        await add_paper(tx, ref)
        await add_reference(tx, paper, ref)


async def _add_citations(tx, paper: Paper, skip_exists=True):
    """Write every paper citing *paper* and the edge linking it to *paper*.

    Args:
        tx: Transaction the Cypher statement is run on; also handed to
            ``add_paper`` and ``add_reference``.
        paper: Paper whose citations are written.
        skip_exists: When true, a citation whose title hash is returned by
            the lookup query is skipped. When false, every citation is
            written again.
    """
    title_hash_exists = set()
    if skip_exists:
        # The lookup result only feeds the skip test below, so the query is
        # not issued at all when nothing is going to be skipped.
        # NOTE(review): this pattern matches nodes that `paper` has a CITE
        # edge *to*; confirm against add_reference that this is the
        # direction stored for citations.
        result = await tx.run(
            "MATCH (p:Publication {title_hash: $title_hash})-[:CITE]->(a:Publication) RETURN a.title_hash",
            title_hash=paper.title_hash())
        title_hash_exists = {title_hash for (title_hash,) in await result.values()}
    # Iterate the citing papers, not the references: each item is linked as
    # (cit, paper) below, which would be the wrong way round for a reference.
    # NOTE(review): assumes Paper exposes get_citations() as the counterpart
    # of get_references() -- confirm against the Paper class.
    async for cit in paper.get_citations():
        if cit.title_hash() in title_hash_exists:
            continue
        await add_paper(tx, cit)
        await add_reference(tx, cit, paper)


class Neo4jSummarizer(Summarizer):
def __init__(self, session: AsyncSession, *args, **kwargs):
def __init__(self, session: AsyncSession, skip_exists=True, *args, **kwargs):
super().__init__(*args, **kwargs)
self.session = session
self.skip_exists = skip_exists

async def write_paper(self, paper) -> None:
await self.session.execute_write(add_paper, paper)
await self.session.execute_write(_add_references, paper)
await self.session.execute_write(_add_citations, paper)
await self.session.execute_write(_add_references, paper, self.skip_exists)
await self.session.execute_write(_add_citations, paper, self.skip_exists)

async def write_reference(self, paper, reference) -> None:
await self.session.execute_write(add_reference, paper, reference)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

setup(
name='citation_crawler',
version='2.8.3',
version='2.8.4',
author='yindaheng98',
author_email='[email protected]',
url='https://github.com/yindaheng98/citation-crawler',
Expand Down

0 comments on commit dc4444b

Please sign in to comment.