--no-skip-exists

yindaheng98 · Apr 22, 2024 · dc4444b
1 parent fd8cc68
commit dc4444b
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 9 deletions.
diff --git a/citation_crawler/__main__.py b/citation_crawler/__main__.py
@@ -112,6 +112,7 @@ async def filter_papers(self, papers):
 parser_n4j.add_argument("--username", type=str, default=None, help=f'Auth username to neo4j database.')
 parser_n4j.add_argument("--password", type=str, default=None, help=f'Auth password to neo4j database.')
 parser_n4j.add_argument("--uri", type=str, required=True, help=f'URI to neo4j database.')
+parser_n4j.add_argument("--no-skip-exists", action="store_true", help=f'Do not skip exists references. Use it when you want to rewrite all papers.')
 
 
 async def func_parser_n4j_async(parser):
@@ -121,7 +122,7 @@ async def func_parser_n4j_async(parser):
     logger.info(f"Specified uri and auth: {args.uri} {args.username} {'******' if args.password else 'none'}")
     async with AsyncGraphDatabase.driver(args.uri, auth=(args.username, args.password)) as driver:
         async with driver.session() as session:
-            summarizer = DefaultNeo4jSummarizer(session)
+            summarizer = DefaultNeo4jSummarizer(session, not args.no_skip_exists)
             crawler = DefaultSemanticScholarCrawler(
                 year, keywords,
                 aid_list,

diff --git a/citation_crawler/summarizers/neo4j.py b/citation_crawler/summarizers/neo4j.py
@@ -90,41 +90,42 @@ async def divide_author(tx, paper: Paper, author_kv, write_fields, division_kv):
                  title_hash=paper.title_hash(), **write_fields)
 
 
-async def _add_references(tx, paper: Paper):
+async def _add_references(tx, paper: Paper, skip_exists=True):
     title_hash_exists = set([
         title_hash for (title_hash,) in
         await (await tx.run("MATCH (a:Publication)-[:CITE]->(p:Publication {title_hash: $title_hash}) RETURN a.title_hash",
                title_hash=paper.title_hash())).values()
     ])
     async for ref in paper.get_references():
-        if ref.title_hash() in title_hash_exists:
+        if skip_exists and ref.title_hash() in title_hash_exists:
             continue
         await add_paper(tx, ref)
         await add_reference(tx, paper, ref)
 
 
-async def _add_citations(tx, paper: Paper):
+async def _add_citations(tx, paper: Paper, skip_exists=True):
     title_hash_exists = set([
         title_hash for (title_hash,) in
         await (await tx.run("MATCH (p:Publication {title_hash: $title_hash})-[:CITE]->(a:Publication) RETURN a.title_hash",
                title_hash=paper.title_hash())).values()
     ])
     async for cit in paper.get_references():
-        if cit.title_hash() in title_hash_exists:
+        if skip_exists and cit.title_hash() in title_hash_exists:
             continue
         await add_paper(tx, cit)
         await add_reference(tx, cit, paper)
 
 
 class Neo4jSummarizer(Summarizer):
-    def __init__(self, session: AsyncSession, *args, **kwargs):
+    def __init__(self, session: AsyncSession, skip_exists=True, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.session = session
+        self.skip_exists = skip_exists
 
     async def write_paper(self, paper) -> None:
         await self.session.execute_write(add_paper, paper)
-        await self.session.execute_write(_add_references, paper)
-        await self.session.execute_write(_add_citations, paper)
+        await self.session.execute_write(_add_references, paper, self.skip_exists)
+        await self.session.execute_write(_add_citations, paper, self.skip_exists)
 
     async def write_reference(self, paper, reference) -> None:
         await self.session.execute_write(add_reference, paper, reference)

diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@
 
 setup(
     name='citation_crawler',
-    version='2.8.3',
+    version='2.8.4',
     author='yindaheng98',
     author_email='[email protected]',
     url='https://github.com/yindaheng98/citation-crawler',