From 4c402b0807303e13752a26daa5a60a832884a0ac Mon Sep 17 00:00:00 2001 From: yindaheng98 Date: Mon, 22 Apr 2024 15:27:28 -0700 Subject: [PATCH] headers --- README.md | 3 +++ citation_crawler/crawlers/common.py | 14 +++++++++++++- setup.py | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5c42e6f..7997329 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,9 @@ optional arguments: * `HTTP_CONCORRENT` * Concurrent HTTP requests * default: `8` +* `HTTP_HEADERS` + * Headers for HTTP requests, given as a JSON object string (ignored if it is not valid JSON) + * default: None ### Write to a JSON file diff --git a/citation_crawler/crawlers/common.py b/citation_crawler/crawlers/common.py index 2b06ac5..f20a081 100644 --- a/citation_crawler/crawlers/common.py +++ b/citation_crawler/crawlers/common.py @@ -1,5 +1,6 @@ from typing import Optional, Dict, Callable import os +import json from datetime import datetime, timedelta import aiohttp @@ -20,9 +21,20 @@ def getenv_int(key) -> int: return None +def getenv_headers(key) -> Dict: + headers = os.getenv(key) + if headers is not None: + try: + return json.loads(headers) + except: + pass + return None + + http_concorent = getenv_int('HTTP_CONCORRENT') http_sem = Semaphore(http_concorent if http_concorent is not None else 8) file_sem = Semaphore(512) +http_headers = getenv_headers('HTTP_HEADERS') def get_cache_datetime(path) -> datetime: @@ -53,7 +65,7 @@ async def download_item(url: str, path: str, cache_days: int, is_valid: Callable async with http_sem: try: - async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False)) as session: + async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False), headers=http_headers) as session: async with session.get(url, proxy=os.getenv("HTTP_PROXY"), timeout=os.getenv("HTTP_TIMEOUT") or 30) as response: diff --git a/setup.py b/setup.py index 673ea98..0eab934 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup( name='citation_crawler', - version='2.8.4', + 
version='2.9', author='yindaheng98', author_email='yindaheng98@gmail.com', url='https://github.com/yindaheng98/citation-crawler',