Add proxy support to search and scraper #104

Open · wants to merge 7 commits into main
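This PR threads a new proxies argument from the public entry points (login, Scraper, Search) down to every httpx Client/AsyncClient they create. The value is passed through unchanged, so it should accept whatever proxy configuration the installed httpx version supports (exact keyword support may vary by httpx version). A minimal sketch of the two common shapes, assuming an httpx-style proxies value; the proxy URLs below are placeholders, not part of this PR:

# Option 1: a single proxy URL for all traffic (placeholder address)
proxies = "http://user:pass@10.0.0.1:3128"

# Option 2: a per-scheme mapping (also placeholders)
proxies = {
    "http://": "http://10.0.0.1:3128",
    "https://": "http://10.0.0.1:3128",
}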
5 changes: 3 additions & 2 deletions twitter/login.py
@@ -146,7 +146,7 @@ def execute_login_flow(client: Client, **kwargs) -> Client | None:
return client


-def login(email: str, username: str, password: str, **kwargs) -> Client:
+def login(email: str, username: str, password: str, proxies=None, **kwargs) -> Client:
client = Client(
cookies={
"email": email,
@@ -162,7 +162,8 @@ def login(email: str, username: str, password: str, **kwargs) -> Client:
'x-twitter-active-user': 'yes',
'x-twitter-client-language': 'en',
},
-follow_redirects=True
+follow_redirects=True,
+proxies=proxies
)
client = execute_login_flow(client, **kwargs)
if not client or client.cookies.get('flow_errors') == 'true':
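A hedged usage sketch for the updated login helper; the credentials and proxy URL are placeholders, not values from this PR:

from twitter.login import login

# Route the whole login flow through a proxy (placeholder credentials/proxy).
client = login("email@example.com", "my_handle", "my_password",
               proxies="http://user:pass@10.0.0.1:3128")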
25 changes: 13 additions & 12 deletions twitter/scraper.py
@@ -30,14 +30,15 @@


class Scraper:
-def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
+def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, proxies=None, **kwargs):
self.save = kwargs.get('save', True)
self.debug = kwargs.get('debug', 0)
self.pbar = kwargs.get('pbar', True)
self.out = Path(kwargs.get('out', 'data'))
self.guest = False
self.logger = self._init_logger(**kwargs)
-self.session = self._validate_session(email, username, password, session, **kwargs)
+self.proxies = proxies
+self.session = self._validate_session(email, username, password, session, proxies=proxies, **kwargs)

def users(self, screen_names: list[str], **kwargs) -> list[dict]:
"""
@@ -258,7 +259,7 @@ def download_media(self, ids: list[int], photos: bool = True, videos: bool = Tru
[urls.append([url, video]) for video in hq_videos]

async def process():
-async with AsyncClient(headers=self.session.headers, cookies=self.session.cookies) as client:
+async with AsyncClient(headers=self.session.headers, cookies=self.session.cookies, proxies=self.proxies) as client:
tasks = (download(client, x, y) for x, y in urls)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Downloading media')
@@ -299,7 +300,7 @@ async def process():
offsets = utc or ["-1200", "-1100", "-1000", "-0900", "-0800", "-0700", "-0600", "-0500", "-0400", "-0300",
"-0200", "-0100", "+0000", "+0100", "+0200", "+0300", "+0400", "+0500", "+0600", "+0700",
"+0800", "+0900", "+1000", "+1100", "+1200", "+1300", "+1400"]
-async with AsyncClient(headers=get_headers(self.session)) as client:
+async with AsyncClient(headers=get_headers(self.session), proxies=self.proxies) as client:
tasks = (get_trends(client, o, url) for o in offsets)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Getting trends')
@@ -453,7 +454,7 @@ async def process():
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
-async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
+async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20, proxies=self.proxies) as c:
tasks = (get(c, key) for key in keys)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Downloading chat data')
@@ -501,7 +502,7 @@ async def process():
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
-async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
+async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20, proxies=self.proxies) as c:
return await asyncio.gather(*(get(c, key) for key in keys))

return asyncio.run(process())
@@ -540,7 +541,7 @@ async def _process(self, operation: tuple, queries: list[dict], **kwargs):
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
-async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
+async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20, proxies=self.proxies) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])
@@ -565,7 +566,7 @@ async def _paginate(self, client: AsyncClient, operation: tuple, **kwargs):
ids = set(find_key(initial_data, 'rest_id'))
cursor = get_cursor(initial_data)
except Exception as e:
-self.logger.error('Failed to get initial pagination data', e)

Review comment: Was this intentional?

+self.logger.error('Failed to get initial pagination data')
return
while (dups < DUP_LIMIT) and cursor:
prev_len = len(ids)
@@ -668,7 +669,7 @@ async def get(c: AsyncClient, space: dict) -> list[dict]:
return r.json()

limits = Limits(max_connections=100)
-async with AsyncClient(headers=client.headers, limits=limits, timeout=30) as c:
+async with AsyncClient(headers=client.headers, limits=limits, timeout=30, proxies=self.proxies) as c:
tasks = (get(c, _id) for _id in spaces)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Getting live transcripts')
@@ -763,7 +764,7 @@ async def poll_space(client: AsyncClient, space: dict) -> dict | None:
async def process(spaces: list[dict]):
limits = Limits(max_connections=100)
headers, cookies = self.session.headers, self.session.cookies
-async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
+async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20, proxies=self.proxies) as c:
return await asyncio.gather(*(poll_space(c, space) for space in spaces))

spaces = self.spaces(rooms=rooms)
@@ -800,13 +801,13 @@ def _validate_session(self, *args, **kwargs):

# try validating cookies dict
if isinstance(cookies, dict) and all(cookies.get(c) for c in {'ct0', 'auth_token'}):
-_session = Client(cookies=cookies, follow_redirects=True)
+_session = Client(cookies=cookies, follow_redirects=True, proxies=self.proxies)
_session.headers.update(get_headers(_session))
return _session

# try validating cookies from file
if isinstance(cookies, str):
-_session = Client(cookies=orjson.loads(Path(cookies).read_bytes()), follow_redirects=True)
+_session = Client(cookies=orjson.loads(Path(cookies).read_bytes()), follow_redirects=True, proxies=self.proxies)
_session.headers.update(get_headers(_session))
return _session

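With the changes above, a Scraper built with proxies forwards the same value to every AsyncClient it creates for pagination, media downloads, trends, chat data, and spaces. A sketch under the same placeholder assumptions (credentials and proxy mapping are not from this PR):

from twitter.scraper import Scraper

# Placeholder credentials and proxy mapping.
scraper = Scraper("email@example.com", "my_handle", "my_password",
                  proxies={"http://": "http://10.0.0.1:3128",
                           "https://": "http://10.0.0.1:3128"})
users = scraper.users(["jack"])  # user lookups now go through the proxy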
11 changes: 6 additions & 5 deletions twitter/search.py
@@ -36,19 +36,20 @@


class Search:
-def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
+def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, proxies=None, **kwargs):
self.save = kwargs.get('save', True)
self.debug = kwargs.get('debug', 0)
self.logger = self._init_logger(**kwargs)
-self.session = self._validate_session(email, username, password, session, **kwargs)
+self.proxies = proxies
+self.session = self._validate_session(email, username, password, session, proxies=proxies, **kwargs)

def run(self, queries: list[dict], limit: int = math.inf, out: str = 'data/search_results', **kwargs):
out = Path(out)
out.mkdir(parents=True, exist_ok=True)
return asyncio.run(self.process(queries, limit, out, **kwargs))

async def process(self, queries: list[dict], limit: int, out: Path, **kwargs) -> list:
-async with AsyncClient(headers=get_headers(self.session)) as s:
+async with AsyncClient(headers=get_headers(self.session), proxies=self.proxies) as s:
return await asyncio.gather(*(self.paginate(s, q, limit, out, **kwargs) for q in queries))

async def paginate(self, client: AsyncClient, query: dict, limit: int, out: Path, **kwargs) -> list[dict]:
@@ -147,13 +148,13 @@ def _validate_session(*args, **kwargs):

# try validating cookies dict
if isinstance(cookies, dict) and all(cookies.get(c) for c in {'ct0', 'auth_token'}):
-_session = Client(cookies=cookies, follow_redirects=True)
+_session = Client(cookies=cookies, follow_redirects=True, proxies=self.proxies)
_session.headers.update(get_headers(_session))
return _session

# try validating cookies from file
if isinstance(cookies, str):
-_session = Client(cookies=orjson.loads(Path(cookies).read_bytes()), follow_redirects=True)
+_session = Client(cookies=orjson.loads(Path(cookies).read_bytes()), follow_redirects=True, proxies=self.proxies)
_session.headers.update(get_headers(_session))
return _session

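And the equivalent for Search, where the same proxies value is handed to the AsyncClient used for paginated queries. The credentials, proxy URL, and query dict shape shown here are illustrative placeholders:

from twitter.search import Search

# Placeholder credentials, proxy, and query.
search = Search("email@example.com", "my_handle", "my_password",
                proxies="http://user:pass@10.0.0.1:3128")
results = search.run([{"category": "Top", "query": "python"}], limit=50)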