redis cache
yindaheng98 committed Feb 27, 2024
1 parent f4d7e98 commit 0df686c
Showing 5 changed files with 52 additions and 6 deletions.
10 changes: 9 additions & 1 deletion citation_crawler/__main__.py
@@ -1,6 +1,7 @@
 import argparse
 import asyncio
 import logging
+import redis.asyncio as redis
 
 from dblp_crawler.keyword.arg import add_argument as add_argument_kw, parse_args as parse_args_kw
 from citation_crawler.arg import add_argument_pid, add_argument_aid, parse_args_pid_author
@@ -112,6 +113,7 @@ async def filter_papers(self, papers):
 parser_n4j.add_argument("--username", type=str, default=None, help=f'Auth username to neo4j database.')
 parser_n4j.add_argument("--password", type=str, default=None, help=f'Auth password to neo4j database.')
 parser_n4j.add_argument("--uri", type=str, required=True, help=f'URI to neo4j database.')
+parser_n4j.add_argument("--redis", type=str, default=None, help=f'URI to redis database.')
 
 
 async def func_parser_n4j_async(parser):
@@ -121,7 +123,13 @@ async def func_parser_n4j_async(parser):
     logger.info(f"Specified uri and auth: {args.uri} {args.username} {'******' if args.password else 'none'}")
     async with AsyncGraphDatabase.driver(args.uri, auth=(args.username, args.password)) as driver:
         async with driver.session() as session:
-            summarizer = DefaultNeo4jSummarizer(session)
+            if args.redis:
+                from citation_crawler.summarizers import Neo4jSummarizerRedisCache
+                client = redis.Redis.from_url(args.redis)
+                cache = Neo4jSummarizerRedisCache(client)
+                summarizer = DefaultNeo4jSummarizer(session, cache=cache)
+            else:
+                summarizer = DefaultNeo4jSummarizer(session)
             crawler = DefaultSemanticScholarCrawler(
                 year, keywords,
                 aid_list,
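With this change, passing --redis swaps the summarizer's in-memory cache for a Redis-backed one; omitting the flag keeps the old behavior. For reference, a minimal sketch of the same wiring outside the CLI (the Redis URL is a placeholder, and the summarizer call appears only in a comment because DefaultNeo4jSummarizer is defined in __main__.py):

import redis.asyncio as redis
from citation_crawler.summarizers import Neo4jSummarizerRedisCache

# Build the client the same way the CLI does from the --redis URI.
client = redis.Redis.from_url("redis://localhost:6379/0")
cache = Neo4jSummarizerRedisCache(client)
# The cache is then handed to the summarizer via its `cache` keyword,
# e.g. DefaultNeo4jSummarizer(session, cache=cache) as in the diff above.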
3 changes: 2 additions & 1 deletion citation_crawler/summarizers/__init__.py
@@ -1,2 +1,3 @@
 from .neo4j import Neo4jSummarizer
-from .nx import NetworkxSummarizer
+from .neo4jcache import Neo4jSummarizerCache, Neo4jSummarizerRedisCache
+from .nx import NetworkxSummarizer
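Both cache classes are now re-exported at the package level, so downstream code can import them alongside the summarizers:

from citation_crawler.summarizers import (
    Neo4jSummarizer,
    Neo4jSummarizerCache,
    Neo4jSummarizerRedisCache,
    NetworkxSummarizer,
)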
3 changes: 1 addition & 2 deletions citation_crawler/summarizers/neo4j.py
@@ -1,4 +1,3 @@
-import abc
 import logging
 from typing import AsyncIterable
 from citation_crawler import Summarizer, Paper
@@ -35,7 +34,7 @@ async def add_paper(tx, cache: Neo4jSummarizerCache, paper: Paper):
             n4jset += ", p.date=$date"
     except Exception as e:
         logger.error(f"Cannot parse date {paper.date()}: {e}")
-    if cache.try_get_paper(
+    if await cache.try_get_paper(
             key=paper.title_hash(),
             data=dict(
                 title_hash=paper.title_hash(),
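The only behavioral change here is the added await: try_get_paper is now a coroutine on both cache variants, so add_paper treats the in-memory and Redis-backed caches as interchangeable behind one awaitable interface. A sketch of the calling pattern (skip_if_cached is a hypothetical helper, not part of this commit):

from typing import Dict

async def skip_if_cached(cache, key: str, data: Dict) -> bool:
    # True means the cached record already matches `data`,
    # so the caller can skip re-writing the paper to neo4j.
    return await cache.try_get_paper(key=key, data=data)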
40 changes: 39 additions & 1 deletion citation_crawler/summarizers/neo4jcache.py
@@ -1,4 +1,9 @@
+import logging
 import pickle
 from typing import Dict, List
+import redis.asyncio as redis
 
+logger = logging.getLogger("cache")
+
+
 class Neo4jSummarizerCache:
@@ -7,7 +12,7 @@ def __init__(self, size: int = 2**20) -> None:
         self.keys: List[str] = []
         self.size = size
 
-    def try_get_paper(self, key: str, data: Dict):
+    async def try_get_paper(self, key: str, data: Dict):
         if key not in self.papers:
             self.papers[key] = data
             self.keys.append(key)
@@ -23,3 +28,36 @@ def try_get_paper(self, key: str, data: Dict):
                 same = False
         self.papers[key] = old_data
         return same
+
+
+class Neo4jSummarizerRedisCache(Neo4jSummarizerCache):
+    def __init__(self, client: redis.Redis) -> None:
+        super().__init__()
+        self.client = client
+
+    async def try_get_paper(self, key: str, data: Dict):
+        old_data_bin = await self.client.hget('papers', key)
+        if old_data_bin is None:
+            try:
+                data_bin = pickle.dumps(data)
+                await self.client.hset('papers', key, data_bin)
+            except Exception as e:
+                logger.error(f"Cannot set cache: {e}, {data}")
+            return False
+        try:
+            old_data = pickle.loads(old_data_bin)
+        except Exception as e:
+            logger.error(f"Cannot get cache: {e}, {old_data_bin}")
+            return False
+        same = True
+        for k in data:
+            if k not in old_data or old_data[k] != data[k]:
+                old_data[k] = data[k]
+                same = False
+        if not same:
+            try:
+                data_bin = pickle.dumps(old_data)
+                await self.client.hset('papers', key, data_bin)
+            except Exception as e:
+                logger.error(f"Cannot update cache: {e}, {old_data}")
+        return same
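Unlike the in-memory dict, the Redis-backed cache pickles each record into a single Redis hash named 'papers', so cached state survives process restarts; the trade-off is that pickled values are opaque to non-Python clients. A minimal smoke test of the cache semantics, assuming redis-py >= 5 and an empty Redis server at redis://localhost:6379/0 (both are assumptions, not part of this commit):

import asyncio
import redis.asyncio as redis
from citation_crawler.summarizers import Neo4jSummarizerRedisCache

async def main():
    client = redis.Redis.from_url("redis://localhost:6379/0")
    cache = Neo4jSummarizerRedisCache(client)
    data = {"title_hash": "abc123", "title": "Example Paper"}
    print(await cache.try_get_paper(key="abc123", data=data))  # False: first sighting, record cached
    print(await cache.try_get_paper(key="abc123", data=data))  # True: identical record already cached
    data["title"] = "Example Paper (revised)"
    print(await cache.try_get_paper(key="abc123", data=data))  # False: record differs, cache updated
    await client.aclose()  # aclose() requires redis-py >= 5

asyncio.run(main())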
2 changes: 1 addition & 1 deletion setup.py
@@ -15,7 +15,7 @@
 
 setup(
     name='citation_crawler',
-    version='2.6',
+    version='2.6.1',
     author='yindaheng98',
     author_email='[email protected]',
     url='https://github.com/yindaheng98/citation-crawler',
