From 142858d647e59307a66bb105bb7d37b927f802f7 Mon Sep 17 00:00:00 2001 From: yindaheng98 Date: Mon, 26 Feb 2024 23:30:08 -0800 Subject: [PATCH] cache citations --- citation_crawler/summarizers/neo4j.py | 10 ++++++---- citation_crawler/summarizers/neo4jcache.py | 11 +++++++++++ setup.py | 2 +- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/citation_crawler/summarizers/neo4j.py b/citation_crawler/summarizers/neo4j.py index c7ff48f..05b28e8 100644 --- a/citation_crawler/summarizers/neo4j.py +++ b/citation_crawler/summarizers/neo4j.py @@ -57,7 +57,9 @@ async def add_paper(tx, cache: Neo4jSummarizerCache, paper: Paper): date=date) -async def add_reference(tx, a: Paper, b: Paper): +async def add_reference(tx, cache: Neo4jSummarizerCache, a: Paper, b: Paper): + if await cache.try_get_citation(a.title_hash(), b.title_hash()): + return await tx.run("MERGE (a:Publication {title_hash: $a}) " "MERGE (b:Publication {title_hash: $b}) " "MERGE (a)-[:CITE]->(b)", @@ -112,7 +114,7 @@ async def _add_references(tx, cache: Neo4jSummarizerCache, paper: Paper): if ref.title_hash() in title_hash_exists: continue await add_paper(tx, cache, ref) - await add_reference(tx, paper, ref) + await add_reference(tx, cache, paper, ref) async def _add_citations(tx, cache: Neo4jSummarizerCache, paper: Paper): @@ -125,7 +127,7 @@ async def _add_citations(tx, cache: Neo4jSummarizerCache, paper: Paper): if cit.title_hash() in title_hash_exists: continue await add_paper(tx, cache, cit) - await add_reference(tx, cit, paper) + await add_reference(tx, cache, cit, paper) class Neo4jSummarizer(Summarizer): @@ -140,7 +142,7 @@ async def write_paper(self, paper) -> None: await self.session.execute_write(_add_citations, self.cache, paper) async def write_reference(self, paper, reference) -> None: - await self.session.execute_write(add_reference, paper, reference) + await self.session.execute_write(add_reference, self.cache, paper, reference) async def get_corrlated_authors(self, paper: Paper) -> AsyncIterable[dict]: authors = set() diff --git a/citation_crawler/summarizers/neo4jcache.py b/citation_crawler/summarizers/neo4jcache.py index 9079823..a077f90 100644 --- a/citation_crawler/summarizers/neo4jcache.py +++ b/citation_crawler/summarizers/neo4jcache.py @@ -29,6 +29,9 @@ async def try_get_paper(self, key: str, data: Dict): self.papers[key] = old_data return same + async def try_get_citation(self, cit: str, ref: str): + return False + class Neo4jSummarizerRedisCache(Neo4jSummarizerCache): def __init__(self, client: redis.Redis) -> None: @@ -61,3 +64,11 @@ async def try_get_paper(self, key: str, data: Dict): except Exception as e: logger.error(f"Cannot update cache: {e}, {old_data}") return same + + async def try_get_citation(self, paper: str, ref: str): + key = f"{paper}->{ref}" + exists = await self.client.hget('citations', key) + if exists is None: + await self.client.hset('citations', key, "exists") + return False + return True diff --git a/setup.py b/setup.py index da9512f..2e2cade 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup( name='citation_crawler', - version='2.6.1', + version='2.6.2', author='yindaheng98', author_email='yindaheng98@gmail.com', url='https://github.com/yindaheng98/citation-crawler',