Skip to content

Commit

Permalink
first cache
Browse files Browse the repository at this point in the history
  • Loading branch information
yindaheng98 committed Feb 27, 2024
1 parent a26520d commit f4d7e98
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 10 deletions.
35 changes: 26 additions & 9 deletions citation_crawler/summarizers/neo4j.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import abc
import logging
from typing import AsyncIterable
from citation_crawler import Summarizer, Paper
Expand All @@ -6,12 +7,14 @@
from neo4j import AsyncSession
import neo4j.time

from .neo4jcache import Neo4jSummarizerCache

'''Use with dblp-crawler'''

logger = logging.getLogger("graph")


async def add_paper(tx, paper: Paper):
async def add_paper(tx, cache: Neo4jSummarizerCache, paper: Paper):
n4jset = "MERGE (p:Publication {title_hash: $title_hash}) "\
"SET p.title=$title, p.year=$year"
if paper.doi():
Expand All @@ -32,6 +35,19 @@ async def add_paper(tx, paper: Paper):
n4jset += ", p.date=$date"
except Exception as e:
logger.error(f"Cannot parse date {paper.date()}: {e}")
if cache.try_get_paper(
key=paper.title_hash(),
data=dict(
title_hash=paper.title_hash(),
title=paper.title(),
year=paper.year(),
paperId=paper.paperId(),
dblp_id=paper.dblp_id(),
doi=paper.doi(),
date=date
)
):
return
await tx.run(n4jset,
title_hash=paper.title_hash(),
title=paper.title(),
Expand Down Expand Up @@ -87,7 +103,7 @@ async def divide_author(tx, paper: Paper, author_kv, write_fields, division_kv):
title_hash=paper.title_hash(), **write_fields)


async def _add_references(tx, paper: Paper):
async def _add_references(tx, cache: Neo4jSummarizerCache, paper: Paper):
title_hash_exists = set([
title_hash for (title_hash,) in
await (await tx.run("MATCH (a:Publication)-[:CITE]->(p:Publication {title_hash: $title_hash}) RETURN a.title_hash",
Expand All @@ -96,11 +112,11 @@ async def _add_references(tx, paper: Paper):
async for ref in paper.get_references():
if ref.title_hash() in title_hash_exists:
continue
await add_paper(tx, ref)
await add_paper(tx, cache, ref)
await add_reference(tx, paper, ref)


async def _add_citations(tx, paper: Paper):
async def _add_citations(tx, cache: Neo4jSummarizerCache, paper: Paper):
title_hash_exists = set([
title_hash for (title_hash,) in
await (await tx.run("MATCH (p:Publication {title_hash: $title_hash})-[:CITE]->(a:Publication) RETURN a.title_hash",
Expand All @@ -109,19 +125,20 @@ async def _add_citations(tx, paper: Paper):
async for cit in paper.get_references():
if cit.title_hash() in title_hash_exists:
continue
await add_paper(tx, cit)
await add_paper(tx, cache, cit)
await add_reference(tx, cit, paper)


class Neo4jSummarizer(Summarizer):
def __init__(self, session: AsyncSession, *args, **kwargs):
def __init__(self, session: AsyncSession, cache: Neo4jSummarizerCache = Neo4jSummarizerCache(), *args, **kwargs):
super().__init__(*args, **kwargs)
self.session = session
self.cache = cache

async def write_paper(self, paper) -> None:
await self.session.execute_write(add_paper, paper)
await self.session.execute_write(_add_references, paper)
await self.session.execute_write(_add_citations, paper)
await self.session.execute_write(add_paper, self.cache, paper)
await self.session.execute_write(_add_references, self.cache, paper)
await self.session.execute_write(_add_citations, self.cache, paper)

async def write_reference(self, paper, reference) -> None:
await self.session.execute_write(add_reference, paper, reference)
Expand Down
25 changes: 25 additions & 0 deletions citation_crawler/summarizers/neo4jcache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from collections import deque
from typing import Deque, Dict, List


class Neo4jSummarizerCache:
    """In-memory FIFO cache of paper property dicts already written to Neo4j.

    `try_get_paper` reports whether a paper's data is unchanged since the
    last time it was seen, which lets the summarizer skip redundant MERGE
    queries against the database.
    """

    def __init__(self, size: int = 2**20) -> None:
        """
        Args:
            size: maximum number of papers to keep; once exceeded, the
                oldest entry (FIFO order) is evicted.
        """
        # key -> last-seen property dict for that paper
        self.papers: Dict[str, Dict] = dict()
        # insertion order of keys; deque gives O(1) eviction from the left
        # (a plain list's pop(0) is O(n) per eviction)
        self.keys: Deque[str] = deque()
        self.size = size

    def try_get_paper(self, key: str, data: Dict) -> bool:
        """Record `data` under `key` and report whether it was already cached.

        Returns True only when the key is present and every field in `data`
        matches the cached entry (the caller may then skip the database
        write).  Returns False — and updates the cache in place — when the
        key is new or any field changed.
        """
        if key not in self.papers:
            self.papers[key] = data
            self.keys.append(key)
            if len(self.keys) > self.size:
                # Evict the oldest entry to bound memory use.
                evicted = self.keys.popleft()
                del self.papers[evicted]
            return False
        old_data = self.papers[key]
        same = True
        for field in data:
            if field not in old_data or old_data[field] != data[field]:
                # Merge the changed field into the cached dict (in place).
                old_data[field] = data[field]
                same = False
        return same
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

setup(
name='citation_crawler',
version='2.5.3',
version='2.6',
author='yindaheng98',
author_email='[email protected]',
url='https://github.com/yindaheng98/citation-crawler',
Expand Down

0 comments on commit f4d7e98

Please sign in to comment.