Skip to content

Commit

Permalink
cache citations
Browse files Browse the repository at this point in the history
  • Loading branch information
yindaheng98 committed Feb 27, 2024
1 parent 0df686c commit 142858d
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 5 deletions.
10 changes: 6 additions & 4 deletions citation_crawler/summarizers/neo4j.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ async def add_paper(tx, cache: Neo4jSummarizerCache, paper: Paper):
date=date)


async def add_reference(tx, a: Paper, b: Paper):
async def add_reference(tx, cache: Neo4jSummarizerCache, a: Paper, b: Paper):
if await cache.try_get_citation(a.title_hash(), b.title_hash()):
return
await tx.run("MERGE (a:Publication {title_hash: $a}) "
"MERGE (b:Publication {title_hash: $b}) "
"MERGE (a)-[:CITE]->(b)",
Expand Down Expand Up @@ -112,7 +114,7 @@ async def _add_references(tx, cache: Neo4jSummarizerCache, paper: Paper):
if ref.title_hash() in title_hash_exists:
continue
await add_paper(tx, cache, ref)
await add_reference(tx, paper, ref)
await add_reference(tx, cache, paper, ref)


async def _add_citations(tx, cache: Neo4jSummarizerCache, paper: Paper):
Expand All @@ -125,7 +127,7 @@ async def _add_citations(tx, cache: Neo4jSummarizerCache, paper: Paper):
if cit.title_hash() in title_hash_exists:
continue
await add_paper(tx, cache, cit)
await add_reference(tx, cit, paper)
await add_reference(tx, cache, cit, paper)


class Neo4jSummarizer(Summarizer):
Expand All @@ -140,7 +142,7 @@ async def write_paper(self, paper) -> None:
await self.session.execute_write(_add_citations, self.cache, paper)

async def write_reference(self, paper, reference) -> None:
await self.session.execute_write(add_reference, paper, reference)
await self.session.execute_write(add_reference, self.cache, paper, reference)

async def get_corrlated_authors(self, paper: Paper) -> AsyncIterable[dict]:
authors = set()
Expand Down
11 changes: 11 additions & 0 deletions citation_crawler/summarizers/neo4jcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ async def try_get_paper(self, key: str, data: Dict):
self.papers[key] = old_data
return same

async def try_get_citation(self, cit: str, ref: str) -> bool:
    """Report whether the citation edge from ``cit`` to ``ref`` is cached.

    This implementation keeps no record of citation edges and therefore
    always answers ``False``.

    Args:
        cit: identifier of the citing paper.
        ref: identifier of the cited paper.

    Returns:
        Always ``False``.
    """
    return False


class Neo4jSummarizerRedisCache(Neo4jSummarizerCache):
def __init__(self, client: redis.Redis) -> None:
Expand Down Expand Up @@ -61,3 +64,11 @@ async def try_get_paper(self, key: str, data: Dict):
except Exception as e:
logger.error(f"Cannot update cache: {e}, {old_data}")
return same

async def try_get_citation(self, paper: str, ref: str) -> bool:
    """Report whether the citation edge ``paper -> ref`` was already recorded.

    Each edge is stored as the field ``"{paper}->{ref}"`` of the Redis hash
    ``citations``. The first call for a given edge records it and returns
    ``False``; every later call for the same edge returns ``True``.

    HSETNX does the existence check and the insert as one atomic command,
    so two coroutines asking about the same edge cannot both see it as
    missing, and a cache miss costs one round trip instead of two.

    NOTE(review): the edge is marked as seen here, before the caller has
    actually written it (``add_reference`` calls this ahead of its MERGE).
    If that write fails, later attempts will skip the edge until the hash
    field is removed -- confirm this is acceptable.

    Args:
        paper: title hash of the citing paper.
        ref: title hash of the cited paper.

    Returns:
        ``True`` if the edge was already in the cache, ``False`` if this
        call just added it.
    """
    key = f"{paper}->{ref}"
    # hsetnx returns 1 when it created the field, 0 when it already existed.
    newly_added = await self.client.hsetnx('citations', key, "exists")
    return not newly_added
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

setup(
name='citation_crawler',
version='2.6.1',
version='2.6.2',
author='yindaheng98',
author_email='[email protected]',
url='https://github.com/yindaheng98/citation-crawler',
Expand Down

0 comments on commit 142858d

Please sign in to comment.