Skip to content

Commit

Permalink
headers
Browse files Browse the repository at this point in the history
  • Loading branch information
yindaheng98 committed Apr 22, 2024
1 parent dc4444b commit 4c402b0
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 2 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ optional arguments:
* `HTTP_CONCORRENT`
* Concurrent HTTP requests
* default: `8`
* `HTTP_HEADERS`
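* Extra headers sent with every HTTP request, given as a JSON object string, e.g. `{"User-Agent": "my-crawler"}` (a value that is not valid JSON is ignored)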
* default: None

### Write to a JSON file

Expand Down
14 changes: 13 additions & 1 deletion citation_crawler/crawlers/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional, Dict, Callable
import os
import json
from datetime import datetime, timedelta

import aiohttp
Expand All @@ -20,9 +21,20 @@ def getenv_int(key) -> int:
return None


def getenv_headers(key) -> Optional[Dict]:
    """Read HTTP headers from the environment variable *key*.

    The variable is expected to hold a JSON object, for example
    ``{"User-Agent": "my-crawler"}``.

    Args:
        key: Name of the environment variable to read.

    Returns:
        The decoded headers as a dict, or ``None`` when the variable is
        unset, is not valid JSON, or does not decode to a JSON object.
    """
    raw = os.getenv(key)
    if raw is None:
        return None
    try:
        parsed = json.loads(raw)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError. Catching only this keeps
        # the best-effort behaviour for malformed input without swallowing
        # KeyboardInterrupt/SystemExit the way a bare ``except:`` does.
        return None
    # Valid JSON that is not an object (e.g. ``5`` or ``"x"``) cannot be used
    # as a header mapping, so treat it the same as a missing value.
    return parsed if isinstance(parsed, dict) else None


# Limit on simultaneous HTTP requests: the HTTP_CONCORRENT value when
# getenv_int yields one, otherwise 8.
http_concorent = getenv_int('HTTP_CONCORRENT')
http_sem = Semaphore(8 if http_concorent is None else http_concorent)
# Independent semaphore with 512 permits (presumably guards file access;
# verify against its users).
file_sem = Semaphore(512)
# Optional request headers read from HTTP_HEADERS; None when not available.
http_headers = getenv_headers('HTTP_HEADERS')


def get_cache_datetime(path) -> datetime:
Expand Down Expand Up @@ -53,7 +65,7 @@ async def download_item(url: str, path: str, cache_days: int, is_valid: Callable

async with http_sem:
try:
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False)) as session:
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False), headers=http_headers) as session:
async with session.get(url,
proxy=os.getenv("HTTP_PROXY"),
timeout=os.getenv("HTTP_TIMEOUT") or 30) as response:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

setup(
name='citation_crawler',
version='2.8.4',
version='2.9',
author='yindaheng98',
author_email='[email protected]',
url='https://github.com/yindaheng98/citation-crawler',
Expand Down

0 comments on commit 4c402b0

Please sign in to comment.