From 4c5e5b080951f4909ec4e03b122acac1fe704d85 Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 24 May 2024 18:09:24 +0300 Subject: [PATCH 01/18] Skeleton for Restoring context middleware --- scrapypuppeteer/middleware.py | 122 ++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index d6aab22..b6cc4a6 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -1,3 +1,4 @@ +import http import json import logging from collections import defaultdict @@ -371,3 +372,124 @@ def __is_closing(self, response, if close_page and remove_request: self._page_closing.remove(main_request) return close_page + + +class PuppeteerContextRecoveryDownloaderMiddleware: # TODO: change name? + """ + This middleware allows you to recover puppeteer context. + + If you want to recover puppeteer context starting from the specified first request provide + `recover_context` meta-key with `True` value. + + The middleware uses additionally these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + ... + + Settings: + + N_RECOVERY: int = 1 - number of recoverable requests + """ + + """ + WORK SCHEME: + + cases: + 1.) First PptrReq (without Context), Its response is good. After some request-response sequence it fails. Trying to recover it N times. + 2.) First PptrReq (without Context), Its response is bad. We need to try to recover it N times. + + For recovering we use context. If we have it we get first request in sequence and trying to recover everything from the beginning. + If we don't have it then we can send the request One more time in process_response until we get it. + """ + + N_RECOVERY_SETTING = "N_RECOVERY" + + def __init__(self, n_recovery): + self.n_recovery = n_recovery + self.context_requests = {} + self.context_counters = {} + + @classmethod + def from_crawler(cls, crawler: Crawler): + n_recovery = crawler.settings.get(cls.N_RECOVERY_SETTING, 1) + if not isinstance(n_recovery, int): + raise TypeError(f"`n_recovery` must be an integer, got {type(n_recovery)}") + elif n_recovery < 1: + raise ValueError("`n_recovery` must be greater than or equal to 1") + return cls(n_recovery) + + @staticmethod + def process_request(request, spider): + if not isinstance(request, PuppeteerRequest): + return None + + if not request.meta.pop('recover_context', False): + return None + + if request.context_id or request.page_id: + raise IgnoreRequest(f"Request {request} is not in the beginning of the request-response sequence") + + print("HERE 6!!!") + request.meta['__request_binding'] = True + return None + + def process_response(self, request, response, spider): + puppeteer_request = request.meta.get('puppeteer_request', None) + __request_binding = puppeteer_request.meta.get('__request_binding', False) if puppeteer_request is not None else None + if isinstance(response, PuppeteerResponse): + if __request_binding: + print("HERE 5!!!") + request.dont_filter = True + request.meta['__restore_count'] = 0 + self.context_requests[response.context_id] = request + self.context_counters[response.context_id] = 1 + return response + else: + # everything is OK + if response.context_id in self.context_counters: + self.context_counters[response.context_id] += 1 + return response + elif puppeteer_request is not None: + print("HERE 1!!!") + # There is an error + if response.status == 422: + print("HERE 2!!!") + # Corrupted context + if __request_binding: + # We did not 
get context + if request.meta.get('__restore_count', 0) < 1: + request.dont_filter = True + request.meta['__restore_count'] = 1 + return request + else: + # No more restoring + return response + else: + # We probably know this sequence + print("HERE 3!!!") + context_id = json.loads(response.text).get('contextId') + if context_id in self.context_requests: # TODO: context_id is updating after it restarts!!! + # We know this sequence + if self.context_counters[context_id] <= self.n_recovery: + restoring_request = self.context_requests[context_id] + if restoring_request.meta['__restore_count'] < 5: + # Restoring! + print("HERE 4!!!") + restoring_request.meta['__restore_count'] += 1 + print(f"Restoring the request {restoring_request}") + self.context_counters[context_id] = 1 + return restoring_request + else: + # No more restoring + return response + else: + # We cannot restore the sequence as it is too long + del self.context_counters[context_id] + del self.context_requests[context_id] + return response + else: + # We cannot restore this sequence as we don't know id + return response + else: + # some other error + return response + return response From e9f5f739cca7dbdcfaa3e96b6e602ee50f743a30 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 28 May 2024 09:37:59 +0300 Subject: [PATCH 02/18] TODOs and structural changes --- scrapypuppeteer/middleware.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index b6cc4a6..dce7e53 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -374,7 +374,7 @@ def __is_closing(self, response, return close_page -class PuppeteerContextRecoveryDownloaderMiddleware: # TODO: change name? +class PuppeteerContextRestoreDownloaderMiddleware: """ This middleware allows you to recover puppeteer context. @@ -432,15 +432,19 @@ def process_request(request, spider): request.meta['__request_binding'] = True return None - def process_response(self, request, response, spider): + def process_response(self, request: Request, response, spider): puppeteer_request = request.meta.get('puppeteer_request', None) - __request_binding = puppeteer_request.meta.get('__request_binding', False) if puppeteer_request is not None else None + __request_binding = puppeteer_request.meta.get('__request_binding', False) if puppeteer_request is not None else None # TODO: to fix NoneType AttributeError if isinstance(response, PuppeteerResponse): if __request_binding: + restoring_request = request.copy() + # TODO: here we need to add meta-key `__original_context_id` + # (or smth like this) in order to distinguish when context print("HERE 5!!!") - request.dont_filter = True - request.meta['__restore_count'] = 0 - self.context_requests[response.context_id] = request + restoring_request.dont_filter = True + restoring_request.meta['__restore_count'] = 0 + restoring_request.meta['__context_id'] = response.context_id + self.context_requests[response.context_id] = restoring_request self.context_counters[response.context_id] = 1 return response else: @@ -466,7 +470,7 @@ def process_response(self, request, response, spider): else: # We probably know this sequence print("HERE 3!!!") - context_id = json.loads(response.text).get('contextId') + context_id = json.loads(response.text).get('contextId') # TODO: to check if context_id is not None! if context_id in self.context_requests: # TODO: context_id is updating after it restarts!!! 
# We know this sequence if self.context_counters[context_id] <= self.n_recovery: From 19d4c27abf6b9dc54b0f36bae755c9a49085f792 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 28 May 2024 15:05:12 +0300 Subject: [PATCH 03/18] Fix: another context_id appearing --- scrapypuppeteer/middleware.py | 79 ++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index dce7e53..c37a658 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -275,8 +275,7 @@ def from_crawler(cls, crawler: Crawler): if isinstance(submit_selector, str): submit_selectors[key] = Click(selector=submit_selector) elif not isinstance(submit_selector, Click): - raise ValueError("Submit selector must be str or Click," - f"but {type(submit_selector)} provided") + raise TypeError(f"Submit selector must be str or Click, got {type(submit_selector)}") return cls(recaptcha_solving, submit_selectors) def process_request(self, request, spider): @@ -387,7 +386,8 @@ class PuppeteerContextRestoreDownloaderMiddleware: Settings: - N_RECOVERY: int = 1 - number of recoverable requests + RESTORING_LENGTH: int = 1 - number of restorable requests in a sequence. + N_RETRY_RESTORING: int = 1 - number of tries to restore a context. """ """ @@ -401,21 +401,30 @@ class PuppeteerContextRestoreDownloaderMiddleware: If we don't have it then we can send the request One more time in process_response until we get it. """ - N_RECOVERY_SETTING = "N_RECOVERY" + N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" + RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" - def __init__(self, n_recovery): + def __init__(self, n_recovery: int, n_retry_restoring: int): self.n_recovery = n_recovery + self.n_retry_restoring = n_retry_restoring self.context_requests = {} self.context_counters = {} @classmethod def from_crawler(cls, crawler: Crawler): - n_recovery = crawler.settings.get(cls.N_RECOVERY_SETTING, 1) - if not isinstance(n_recovery, int): - raise TypeError(f"`n_recovery` must be an integer, got {type(n_recovery)}") - elif n_recovery < 1: + restoring_length = crawler.settings.get(cls.RESTORING_LENGTH_SETTING, 1) + if not isinstance(restoring_length, int): + raise TypeError(f"`n_recovery` must be an integer, got {type(restoring_length)}") + elif restoring_length < 1: raise ValueError("`n_recovery` must be greater than or equal to 1") - return cls(n_recovery) + + n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) + if not isinstance(n_retry_restoring, int): + raise TypeError(f"`n_recovery` must be an integer, got {type(n_retry_restoring)}") + elif n_retry_restoring < 1: + raise ValueError("`n_recovery` must be greater than or equal to 1") + + return cls(restoring_length, n_retry_restoring) @staticmethod def process_request(request, spider): @@ -430,23 +439,36 @@ def process_request(request, spider): print("HERE 6!!!") request.meta['__request_binding'] = True + request.dont_filter = True return None def process_response(self, request: Request, response, spider): - puppeteer_request = request.meta.get('puppeteer_request', None) - __request_binding = puppeteer_request.meta.get('__request_binding', False) if puppeteer_request is not None else None # TODO: to fix NoneType AttributeError + puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get('puppeteer_request', None) + # __request_binding = puppeteer_request.meta.get('__request_binding', False) if puppeteer_request is not None else None # TODO: to fix NoneType 
AttributeError + __request_binding = puppeteer_request and puppeteer_request.meta.get('__request_binding', False) if isinstance(response, PuppeteerResponse): if __request_binding: - restoring_request = request.copy() - # TODO: here we need to add meta-key `__original_context_id` - # (or smth like this) in order to distinguish when context - print("HERE 5!!!") - restoring_request.dont_filter = True - restoring_request.meta['__restore_count'] = 0 - restoring_request.meta['__context_id'] = response.context_id - self.context_requests[response.context_id] = restoring_request - self.context_counters[response.context_id] = 1 - return response + if request.meta.get('__context_id', None) is not None: + # Restoring corrupted context + print("HERE 7!!!") + restoring_request = request.copy() + old_context_id = restoring_request.meta['__context_id'] + restoring_request.meta['__context_id'] = response.context_id + self.context_requests[response.context_id] = restoring_request + self.context_counters[response.context_id] = 1 + del self.context_requests[old_context_id] + del self.context_counters[old_context_id] + return response + else: + # Just first request-response in the sequence + restoring_request = request.copy() + print("HERE 5!!!") + restoring_request.dont_filter = True + restoring_request.meta['__restore_count'] = 0 + restoring_request.meta['__context_id'] = response.context_id + self.context_requests[response.context_id] = restoring_request + self.context_counters[response.context_id] = 1 + return response else: # everything is OK if response.context_id in self.context_counters: @@ -461,8 +483,7 @@ def process_response(self, request: Request, response, spider): if __request_binding: # We did not get context if request.meta.get('__restore_count', 0) < 1: - request.dont_filter = True - request.meta['__restore_count'] = 1 + request.meta['__restore_count'] += 1 return request else: # No more restoring @@ -470,12 +491,12 @@ def process_response(self, request: Request, response, spider): else: # We probably know this sequence print("HERE 3!!!") - context_id = json.loads(response.text).get('contextId') # TODO: to check if context_id is not None! - if context_id in self.context_requests: # TODO: context_id is updating after it restarts!!! + context_id = json.loads(response.text).get('contextId') + if context_id in self.context_requests: # We know this sequence - if self.context_counters[context_id] <= self.n_recovery: + if self.context_counters[context_id] < self.n_recovery: restoring_request = self.context_requests[context_id] - if restoring_request.meta['__restore_count'] < 5: + if restoring_request.meta['__restore_count'] < 3: # Restoring! 
print("HERE 4!!!") restoring_request.meta['__restore_count'] += 1 @@ -486,6 +507,8 @@ def process_response(self, request: Request, response, spider): # No more restoring return response else: + print("HERE 8!!!") + print("N_RECOVERY number is exceeded!") # We cannot restore the sequence as it is too long del self.context_counters[context_id] del self.context_requests[context_id] From ef3af62ac9f9ed8dac0e7aa0e393bad76e8219ed Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 28 May 2024 15:44:13 +0300 Subject: [PATCH 04/18] Naming --- scrapypuppeteer/middleware.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index c37a658..f5328e8 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -404,8 +404,8 @@ class PuppeteerContextRestoreDownloaderMiddleware: N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" - def __init__(self, n_recovery: int, n_retry_restoring: int): - self.n_recovery = n_recovery + def __init__(self, restoring_length: int, n_retry_restoring: int): + self.restoring_length = restoring_length self.n_retry_restoring = n_retry_restoring self.context_requests = {} self.context_counters = {} @@ -414,15 +414,15 @@ def __init__(self, n_recovery: int, n_retry_restoring: int): def from_crawler(cls, crawler: Crawler): restoring_length = crawler.settings.get(cls.RESTORING_LENGTH_SETTING, 1) if not isinstance(restoring_length, int): - raise TypeError(f"`n_recovery` must be an integer, got {type(restoring_length)}") + raise TypeError(f"`{cls.RESTORING_LENGTH_SETTING}` must be an integer, got {type(restoring_length)}") elif restoring_length < 1: - raise ValueError("`n_recovery` must be greater than or equal to 1") + raise ValueError(f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1") n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) if not isinstance(n_retry_restoring, int): - raise TypeError(f"`n_recovery` must be an integer, got {type(n_retry_restoring)}") + raise TypeError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be an integer, got {type(n_retry_restoring)}") elif n_retry_restoring < 1: - raise ValueError("`n_recovery` must be greater than or equal to 1") + raise ValueError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1") return cls(restoring_length, n_retry_restoring) @@ -444,7 +444,6 @@ def process_request(request, spider): def process_response(self, request: Request, response, spider): puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get('puppeteer_request', None) - # __request_binding = puppeteer_request.meta.get('__request_binding', False) if puppeteer_request is not None else None # TODO: to fix NoneType AttributeError __request_binding = puppeteer_request and puppeteer_request.meta.get('__request_binding', False) if isinstance(response, PuppeteerResponse): if __request_binding: @@ -453,11 +452,11 @@ def process_response(self, request: Request, response, spider): print("HERE 7!!!") restoring_request = request.copy() old_context_id = restoring_request.meta['__context_id'] + del self.context_requests[old_context_id] + del self.context_counters[old_context_id] restoring_request.meta['__context_id'] = response.context_id self.context_requests[response.context_id] = restoring_request self.context_counters[response.context_id] = 1 - del self.context_requests[old_context_id] - del self.context_counters[old_context_id] return 
response else: # Just first request-response in the sequence @@ -494,9 +493,9 @@ def process_response(self, request: Request, response, spider): context_id = json.loads(response.text).get('contextId') if context_id in self.context_requests: # We know this sequence - if self.context_counters[context_id] < self.n_recovery: + if self.context_counters[context_id] <= self.restoring_length: restoring_request = self.context_requests[context_id] - if restoring_request.meta['__restore_count'] < 3: + if restoring_request.meta['__restore_count'] < self.n_retry_restoring: # Restoring! print("HERE 4!!!") restoring_request.meta['__restore_count'] += 1 @@ -504,11 +503,13 @@ def process_response(self, request: Request, response, spider): self.context_counters[context_id] = 1 return restoring_request else: + print("HERE 9!!!") + print(f"`{self.N_RETRY_RESTORING_SETTING}` number is exceeded!") # No more restoring return response else: print("HERE 8!!!") - print("N_RECOVERY number is exceeded!") + print(f"`{self.RESTORING_LENGTH_SETTING}` number is exceeded!") # We cannot restore the sequence as it is too long del self.context_counters[context_id] del self.context_requests[context_id] From 35c6faddc10d8e2fa56bf9d2c13526a2420b4250 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 28 May 2024 18:16:06 +0300 Subject: [PATCH 05/18] Simplified structure --- scrapypuppeteer/middleware.py | 131 +++++++++++++++------------------- 1 file changed, 56 insertions(+), 75 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index f5328e8..be5ef44 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -11,7 +11,8 @@ from scrapy.http import Headers, TextResponse from scrapypuppeteer.actions import Click, GoBack, GoForward, GoTo, RecaptchaSolver, Screenshot, Scroll, CustomJsAction -from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse, PuppeteerScreenshotResponse, PuppeteerJsonResponse +from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse, PuppeteerScreenshotResponse, \ + PuppeteerJsonResponse from scrapypuppeteer.request import ActionRequest, PuppeteerRequest @@ -445,79 +446,59 @@ def process_request(request, spider): def process_response(self, request: Request, response, spider): puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get('puppeteer_request', None) __request_binding = puppeteer_request and puppeteer_request.meta.get('__request_binding', False) + if isinstance(response, PuppeteerResponse): - if __request_binding: - if request.meta.get('__context_id', None) is not None: - # Restoring corrupted context - print("HERE 7!!!") - restoring_request = request.copy() - old_context_id = restoring_request.meta['__context_id'] - del self.context_requests[old_context_id] - del self.context_counters[old_context_id] - restoring_request.meta['__context_id'] = response.context_id - self.context_requests[response.context_id] = restoring_request - self.context_counters[response.context_id] = 1 - return response - else: - # Just first request-response in the sequence - restoring_request = request.copy() - print("HERE 5!!!") - restoring_request.dont_filter = True - restoring_request.meta['__restore_count'] = 0 - restoring_request.meta['__context_id'] = response.context_id - self.context_requests[response.context_id] = restoring_request - self.context_counters[response.context_id] = 1 - return response - else: - # everything is OK - if response.context_id in self.context_counters: - 
self.context_counters[response.context_id] += 1 - return response - elif puppeteer_request is not None: - print("HERE 1!!!") - # There is an error - if response.status == 422: - print("HERE 2!!!") - # Corrupted context - if __request_binding: - # We did not get context - if request.meta.get('__restore_count', 0) < 1: - request.meta['__restore_count'] += 1 - return request - else: - # No more restoring - return response - else: - # We probably know this sequence - print("HERE 3!!!") - context_id = json.loads(response.text).get('contextId') - if context_id in self.context_requests: - # We know this sequence - if self.context_counters[context_id] <= self.restoring_length: - restoring_request = self.context_requests[context_id] - if restoring_request.meta['__restore_count'] < self.n_retry_restoring: - # Restoring! - print("HERE 4!!!") - restoring_request.meta['__restore_count'] += 1 - print(f"Restoring the request {restoring_request}") - self.context_counters[context_id] = 1 - return restoring_request - else: - print("HERE 9!!!") - print(f"`{self.N_RETRY_RESTORING_SETTING}` number is exceeded!") - # No more restoring - return response - else: - print("HERE 8!!!") - print(f"`{self.RESTORING_LENGTH_SETTING}` number is exceeded!") - # We cannot restore the sequence as it is too long - del self.context_counters[context_id] - del self.context_requests[context_id] - return response - else: - # We cannot restore this sequence as we don't know id - return response - else: - # some other error - return response + self._bind_context(request, response, __request_binding) + elif puppeteer_request is not None and response.status == 422: + # Corrupted context + return self._restore_context(request, response, __request_binding) + return response + + def _bind_context(self, + request, response, + __request_binding): + if __request_binding: + if request.meta.get('__context_id', None) is not None: + print("HERE 7!!!") + # Restoring corrupted context + old_context_id = request.meta['__context_id'] + del self.context_requests[old_context_id] + del self.context_counters[old_context_id] + restoring_request = request.copy() + restoring_request.meta['__restore_count'] = restoring_request.meta.get('__restore_count', + 0) # TODO: can we use just meta instead of self.(...)? + restoring_request.meta['__context_id'] = response.context_id + self.context_requests[response.context_id] = restoring_request + self.context_counters[response.context_id] = 0 + # everything is OK + if response.context_id in self.context_counters: # TODO: I don't like this code here. + self.context_counters[response.context_id] += 1 + + def _restore_context(self, + request, response, + __request_binding): + if __request_binding: # TODO: this is not restoring context. This is binding. + # We did not get context + if request.meta.get('__restore_count', 0) < self.n_retry_restoring: + request.meta['__restore_count'] += 1 + return request + else: + # We probably know this sequence + print("HERE 3!!!") + context_id = json.loads(response.text).get('contextId', None) + if context_id in self.context_requests: + # We know this sequence + restoring_request = self.context_requests[context_id] + if self.context_counters[context_id] <= self.restoring_length and \ + restoring_request.meta['__restore_count'] < self.n_retry_restoring: + # Restoring! 
+ print("HERE 4!!!") + restoring_request.meta['__restore_count'] += 1 + print(f"Restoring the request {restoring_request}") + self.context_counters[context_id] = 1 + return restoring_request + else: # TODO: to determine the reason of disability to restore the sequence. + # We cannot restore the sequence as it is too long + del self.context_counters[context_id] + del self.context_requests[context_id] return response From afc5af2470f2851cf28dbca6b2b27fbe3bef656a Mon Sep 17 00:00:00 2001 From: matthew Date: Wed, 29 May 2024 14:54:50 +0300 Subject: [PATCH 06/18] Refactored middleware --- scrapypuppeteer/middleware.py | 104 ++++++++++++++++------------------ 1 file changed, 50 insertions(+), 54 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index be5ef44..b1f0198 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -1,9 +1,10 @@ -import http import json import logging + from collections import defaultdict from typing import List, Union from urllib.parse import urlencode, urljoin +from http import HTTPStatus from scrapy import Request, signals from scrapy.crawler import Crawler @@ -438,67 +439,62 @@ def process_request(request, spider): if request.context_id or request.page_id: raise IgnoreRequest(f"Request {request} is not in the beginning of the request-response sequence") - print("HERE 6!!!") request.meta['__request_binding'] = True request.dont_filter = True return None def process_response(self, request: Request, response, spider): puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get('puppeteer_request', None) - __request_binding = puppeteer_request and puppeteer_request.meta.get('__request_binding', False) + request_binding = puppeteer_request is not None and puppeteer_request.meta.get('__request_binding', False) # TODO: it's too difficult if isinstance(response, PuppeteerResponse): - self._bind_context(request, response, __request_binding) - elif puppeteer_request is not None and response.status == 422: - # Corrupted context - return self._restore_context(request, response, __request_binding) + if request_binding: + self._bind_context(request, response) + if response.context_id in self.context_counters: + # Update number of actions in context + self.context_counters[response.context_id] += 1 + elif puppeteer_request is not None and response.status == HTTPStatus.UNPROCESSABLE_ENTITY: + # One PuppeteerRequest has failed with 422 error + if request_binding: + # Could not get context, retry + if request.meta.get('__restore_count', 0) < self.n_retry_restoring: + request.meta['__restore_count'] += 1 + return request + else: + return self._restore_context(response) return response - def _bind_context(self, - request, response, - __request_binding): - if __request_binding: - if request.meta.get('__context_id', None) is not None: - print("HERE 7!!!") - # Restoring corrupted context - old_context_id = request.meta['__context_id'] - del self.context_requests[old_context_id] - del self.context_counters[old_context_id] - restoring_request = request.copy() - restoring_request.meta['__restore_count'] = restoring_request.meta.get('__restore_count', - 0) # TODO: can we use just meta instead of self.(...)? - restoring_request.meta['__context_id'] = response.context_id - self.context_requests[response.context_id] = restoring_request - self.context_counters[response.context_id] = 0 - # everything is OK - if response.context_id in self.context_counters: # TODO: I don't like this code here. 
- self.context_counters[response.context_id] += 1 - - def _restore_context(self, - request, response, - __request_binding): - if __request_binding: # TODO: this is not restoring context. This is binding. - # We did not get context - if request.meta.get('__restore_count', 0) < self.n_retry_restoring: - request.meta['__restore_count'] += 1 - return request - else: - # We probably know this sequence - print("HERE 3!!!") - context_id = json.loads(response.text).get('contextId', None) - if context_id in self.context_requests: - # We know this sequence - restoring_request = self.context_requests[context_id] - if self.context_counters[context_id] <= self.restoring_length and \ - restoring_request.meta['__restore_count'] < self.n_retry_restoring: - # Restoring! - print("HERE 4!!!") - restoring_request.meta['__restore_count'] += 1 - print(f"Restoring the request {restoring_request}") - self.context_counters[context_id] = 1 - return restoring_request - else: # TODO: to determine the reason of disability to restore the sequence. - # We cannot restore the sequence as it is too long - del self.context_counters[context_id] - del self.context_requests[context_id] + def _bind_context(self, request, response): + if request.meta.get('__context_id', None) is not None: + # Need to update context_id + self.__delete_context(request.meta['__context_id'], "DELETING OLD CONTEXT") + restoring_request = request.copy() + restoring_request.meta['__restore_count'] = restoring_request.meta.get('__restore_count', 0) + restoring_request.meta['__context_id'] = response.context_id + self.context_requests[response.context_id] = restoring_request + self.context_counters[response.context_id] = 0 + + def _restore_context(self, response): + context_id = json.loads(response.text).get('contextId', None) + + if context_id in self.context_requests: + restoring_request = self.context_requests[context_id] + + if self.context_counters[context_id] > self.restoring_length: # TODO: not informative variables + # Too many actions in context + self.__delete_context(context_id, "TOO MANY ACTIONS IN CONTEXT") + elif restoring_request.meta['__restore_count'] >= self.n_retry_restoring: # TODO: try to fix the > and >= (why not the same???) 
+ # Too many retries + self.__delete_context(context_id, "TOO MANY RETRIES") + else: + # Restoring + restoring_request.meta['__restore_count'] += 1 + print(f"Restoring the request {restoring_request}") # TODO: to make logging + self.context_counters[context_id] = 1 + return restoring_request return response + + def __delete_context(self, context_id: str, reason: str): + del self.context_counters[context_id] + del self.context_requests[context_id] + print(reason) From bc2f698c8345f70e055ca08a5feca4070d0cbedd Mon Sep 17 00:00:00 2001 From: matthew Date: Wed, 29 May 2024 16:43:00 +0300 Subject: [PATCH 07/18] Middleware is done --- scrapypuppeteer/middleware.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index b1f0198..59f3ad8 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -410,7 +410,7 @@ def __init__(self, restoring_length: int, n_retry_restoring: int): self.restoring_length = restoring_length self.n_retry_restoring = n_retry_restoring self.context_requests = {} - self.context_counters = {} + self.context_length = {} @classmethod def from_crawler(cls, crawler: Crawler): @@ -450,9 +450,9 @@ def process_response(self, request: Request, response, spider): if isinstance(response, PuppeteerResponse): if request_binding: self._bind_context(request, response) - if response.context_id in self.context_counters: + if response.context_id in self.context_length: # Update number of actions in context - self.context_counters[response.context_id] += 1 + self.context_length[response.context_id] += 1 elif puppeteer_request is not None and response.status == HTTPStatus.UNPROCESSABLE_ENTITY: # One PuppeteerRequest has failed with 422 error if request_binding: @@ -472,7 +472,7 @@ def _bind_context(self, request, response): restoring_request.meta['__restore_count'] = restoring_request.meta.get('__restore_count', 0) restoring_request.meta['__context_id'] = response.context_id self.context_requests[response.context_id] = restoring_request - self.context_counters[response.context_id] = 0 + self.context_length[response.context_id] = 0 def _restore_context(self, response): context_id = json.loads(response.text).get('contextId', None) @@ -480,21 +480,21 @@ def _restore_context(self, response): if context_id in self.context_requests: restoring_request = self.context_requests[context_id] - if self.context_counters[context_id] > self.restoring_length: # TODO: not informative variables + if self.context_length[context_id] >= self.restoring_length + 1: # Too many actions in context self.__delete_context(context_id, "TOO MANY ACTIONS IN CONTEXT") - elif restoring_request.meta['__restore_count'] >= self.n_retry_restoring: # TODO: try to fix the > and >= (why not the same???) 
+ elif restoring_request.meta['__restore_count'] >= self.n_retry_restoring: # Too many retries self.__delete_context(context_id, "TOO MANY RETRIES") else: # Restoring restoring_request.meta['__restore_count'] += 1 print(f"Restoring the request {restoring_request}") # TODO: to make logging - self.context_counters[context_id] = 1 + self.context_length[context_id] = 1 return restoring_request return response def __delete_context(self, context_id: str, reason: str): - del self.context_counters[context_id] + del self.context_length[context_id] del self.context_requests[context_id] print(reason) From 50ff7ce15dfdae2c0090cfa8f46258415d647304 Mon Sep 17 00:00:00 2001 From: matthew Date: Wed, 29 May 2024 17:44:34 +0300 Subject: [PATCH 08/18] Restructuring project --- scrapypuppeteer/middleware.py | 500 ------------------ scrapypuppeteer/middleware/__init__.py | 3 + .../middleware/recaptcha_middleware.py | 175 ++++++ .../middleware/restore_middleware.py | 144 +++++ .../middleware/service_middleware.py | 208 ++++++++ 5 files changed, 530 insertions(+), 500 deletions(-) delete mode 100644 scrapypuppeteer/middleware.py create mode 100644 scrapypuppeteer/middleware/__init__.py create mode 100644 scrapypuppeteer/middleware/recaptcha_middleware.py create mode 100644 scrapypuppeteer/middleware/restore_middleware.py create mode 100644 scrapypuppeteer/middleware/service_middleware.py diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py deleted file mode 100644 index 59f3ad8..0000000 --- a/scrapypuppeteer/middleware.py +++ /dev/null @@ -1,500 +0,0 @@ -import json -import logging - -from collections import defaultdict -from typing import List, Union -from urllib.parse import urlencode, urljoin -from http import HTTPStatus - -from scrapy import Request, signals -from scrapy.crawler import Crawler -from scrapy.exceptions import IgnoreRequest, NotConfigured -from scrapy.http import Headers, TextResponse - -from scrapypuppeteer.actions import Click, GoBack, GoForward, GoTo, RecaptchaSolver, Screenshot, Scroll, CustomJsAction -from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse, PuppeteerScreenshotResponse, \ - PuppeteerJsonResponse -from scrapypuppeteer.request import ActionRequest, PuppeteerRequest - - -class PuppeteerServiceDownloaderMiddleware: - """ - This downloader middleware converts PuppeteerRequest instances to - Puppeteer service API requests and then converts its responses to - PuppeteerResponse instances. Additionally, it tracks all browser contexts - that spider uses and performs cleanup request to service once spider - is closed. - - Additionally, the middleware uses these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' - - Settings: - - PUPPETEER_SERVICE_URL (str) - Service URL, e.g. 'http://localhost:3000' - - PUPPETEER_INCLUDE_HEADERS (bool|list[str]) - Determines which request headers will be sent to remote site by puppeteer service. - Either True (all headers), False (no headers) or list of header names. - May be overriden per request. - By default, only cookies are sent. - - PUPPETEER_INCLUDE_META (bool) - Determines whether to send or not user's meta attached by user. - Default to False. 
- """ - - SERVICE_URL_SETTING = 'PUPPETEER_SERVICE_URL' - INCLUDE_HEADERS_SETTING = 'PUPPETEER_INCLUDE_HEADERS' - SERVICE_META_SETTING = 'PUPPETEER_INCLUDE_META' - DEFAULT_INCLUDE_HEADERS = ['Cookie'] # TODO send them separately - - def __init__(self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool): - self.service_base_url = service_url - self.include_headers = include_headers - self.include_meta = include_meta - self.crawler = crawler - self.used_contexts = defaultdict(set) - - @classmethod - def from_crawler(cls, crawler): - service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) - if service_url is None: - raise ValueError('Puppeteer service URL must be provided') - if cls.INCLUDE_HEADERS_SETTING in crawler.settings: - try: - include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) - except ValueError: - include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) - else: - include_headers = cls.DEFAULT_INCLUDE_HEADERS - include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) - middleware = cls(crawler, service_url, include_headers, include_meta) - crawler.signals.connect(middleware.close_used_contexts, - signal=signals.spider_closed) - return middleware - - def process_request(self, request, spider): - if not isinstance(request, PuppeteerRequest): - return - - action = request.action - service_url = urljoin(self.service_base_url, action.endpoint) - service_params = self._encode_service_params(request) - if service_params: - service_url += '?' + service_params - - meta = { - 'puppeteer_request': request, - 'dont_obey_robotstxt': True, - 'proxy': None - } - if self.include_meta: - meta = { - **request.meta, - **meta - } - - return ActionRequest( - url=service_url, - action=action, - method='POST', - headers=Headers({'Content-Type': action.content_type}), - body=self._serialize_body(action, request), - dont_filter=True, - cookies=request.cookies, - priority=request.priority, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta=meta - ) - - @staticmethod - def _encode_service_params(request): - service_params = {} - if request.context_id is not None: - service_params['contextId'] = request.context_id - if request.page_id is not None: - service_params['pageId'] = request.page_id - if request.close_page: - service_params['closePage'] = 1 - return urlencode(service_params) - - def _serialize_body(self, action, request): - payload = action.payload() - if action.content_type == 'application/json': - if isinstance(payload, dict): - # disallow null values in top-level request parameters - payload = {k: v for k, v in payload.items() if v is not None} - proxy = request.meta.get('proxy') - if proxy: - payload['proxy'] = proxy - include_headers = self.include_headers if request.include_headers is None else request.include_headers - if include_headers: - headers = request.headers.to_unicode_dict() - if isinstance(include_headers, list): - headers = {h.lower(): headers[h] for h in include_headers if h in headers} - payload['headers'] = headers - return json.dumps(payload) - return str(payload) - - def process_response(self, request, response, spider): - if not isinstance(response, TextResponse): - return response - - puppeteer_request = request.meta.get('puppeteer_request') - if puppeteer_request is None: - return response - - if b'application/json' not in response.headers.get(b'Content-Type', b''): - return response.replace(request=request) - - response_data 
= json.loads(response.text) - response_cls = self._get_response_class(puppeteer_request.action) - - if response.status != 200: - context_id = response_data.get('contextId') - if context_id: - self.used_contexts[id(spider)].add(context_id) - return response - - return self._form_response(response_cls, response_data, - puppeteer_request.url, request, puppeteer_request, - spider) - - def _form_response(self, response_cls, response_data, - url, request, puppeteer_request, - spider): - context_id = response_data.pop('contextId', puppeteer_request.context_id) - page_id = response_data.pop('pageId', puppeteer_request.page_id) - - attributes = dict() - for attr in response_cls.attributes: - if attr in response_data: - attributes[attr] = response_data.pop(attr) - if response_data: - attributes['data'] = response_data - - self.used_contexts[id(spider)].add(context_id) - - return response_cls( - url=url, - puppeteer_request=puppeteer_request, - context_id=context_id, - page_id=page_id, - request=request, - **attributes - ) - - @staticmethod - def _get_response_class(request_action): - if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)): - return PuppeteerHtmlResponse - if isinstance(request_action, Screenshot): - return PuppeteerScreenshotResponse - return PuppeteerJsonResponse - - def close_used_contexts(self, spider): - contexts = list(self.used_contexts[id(spider)]) - if contexts: - request = Request(urljoin(self.service_base_url, '/close_context'), - method='POST', - headers=Headers({'Content-Type': 'application/json'}), - meta={"proxy": None}, - body=json.dumps(contexts)) - return self.crawler.engine.downloader.fetch(request, None) - - -class PuppeteerRecaptchaDownloaderMiddleware: - """ - This middleware is supposed to solve recaptcha on the page automatically. - If there is no captcha on the page then this middleware will do nothing - on the page, so your 2captcha balance will remain the same. - It can submit recaptcha if "submit button" is provided. - It will not "submit" captcha if there is no submit-selector. - - If you want to turn Recaptcha solving off on the exact request provide - meta-key 'dont_recaptcha' with True value. The middleware will skip the request - through itself. - - The middleware uses additionally these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - '_captcha_submission', '_captcha_solving' - - Settings: - - RECAPTCHA_ACTIVATION: bool = True - activates or not the middleware (if not - raises NotConfigured) - RECAPTCHA_SOLVING: bool = True - whether solve captcha automatically or not - RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - dictionary consisting of domains and - these domains' submit selectors, e.g. - 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit' - it could be also squeezed to - 'ecaptcha/api2/de': '#recaptcha-demo-submit' - also you can use not just strings but Click actions with required parameters: - 'ogle.com/recaptcha': Click('#recaptcha-demo-submit') - In general - domain is a unique identifying string which is contained in web-page url - If there is no button to submit recaptcha then provide empty string to a domain. - This setting can also be a string. If so the middleware will only click the button - related to this selector. - This setting can also be unprovided. In this case every web-page you crawl is supposed to be - without submit button, or you manually do it yourself. 
- """ - - MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION" - RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" - SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" - - def __init__(self, - recaptcha_solving: bool, - submit_selectors: dict): - self.submit_selectors = submit_selectors - self.recaptcha_solving = recaptcha_solving - self._page_responses = dict() - self._page_closing = set() - - @classmethod - def from_crawler(cls, crawler: Crawler): - activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True) - if not activation: - raise NotConfigured - recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) - - try: - submit_selectors = crawler.settings.getdict(cls.SUBMIT_SELECTORS_SETTING, dict()) - except ValueError: - submit_selectors = {'': crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, '')} - except Exception as exception: - raise ValueError(f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}") - - for key in submit_selectors.keys(): - submit_selector = submit_selectors[key] - if isinstance(submit_selector, str): - submit_selectors[key] = Click(selector=submit_selector) - elif not isinstance(submit_selector, Click): - raise TypeError(f"Submit selector must be str or Click, got {type(submit_selector)}") - return cls(recaptcha_solving, submit_selectors) - - def process_request(self, request, spider): - if request.meta.get('dont_recaptcha', False): - return None - - if isinstance(request, PuppeteerRequest): - if request.close_page and not request.meta.get('_captcha_submission', False): - request.close_page = False - request.dont_filter = True - self._page_closing.add(request) - return request - return None - - def process_response(self, - request, response, - spider): - if not isinstance(response, PuppeteerResponse): # We only work with PuppeteerResponses - return response - - puppeteer_request = response.puppeteer_request - if puppeteer_request.meta.get('dont_recaptcha', False): # Skip such responses - return response - - if puppeteer_request.meta.pop('_captcha_submission', False): # Submitted captcha - return self.__gen_response(response) - - if puppeteer_request.meta.pop('_captcha_solving', False): - # RECaptchaSolver was called by recaptcha middleware - return self._submit_recaptcha(request, response, spider) - - if isinstance(puppeteer_request.action, - (Screenshot, Scroll, CustomJsAction, RecaptchaSolver)): - # No recaptcha after this action - return response - - # Any puppeteer response besides RecaptchaSolver's PuppeteerResponse - return self._solve_recaptcha(request, response) - - def _solve_recaptcha(self, request, response): - self._page_responses[response.page_id] = response # Saving main response to return it later - - recaptcha_solver = RecaptchaSolver(solve_recaptcha=self.recaptcha_solving, - close_on_empty=self.__is_closing(response, remove_request=False)) - return response.follow(recaptcha_solver, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta={'_captcha_solving': True}, - close_page=False) - - def _submit_recaptcha(self, request, response, spider): - response_data = response.data - if not response.puppeteer_request.action.solve_recaptcha: - spider.log(message=f"Found {len(response_data['recaptcha_data']['captchas'])} captcha " - f"but did not solve due to argument", - level=logging.INFO) - return self.__gen_response(response) - # Click "submit button"? 
- if response_data['recaptcha_data']['captchas'] and self.submit_selectors: - # We need to click "submit button" - for domain, submitting in self.submit_selectors.items(): - if domain in response.url: - if not submitting.selector: - return self.__gen_response(response) - return response.follow(action=submitting, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - close_page=self.__is_closing(response), - meta={'_captcha_submission': True}) - raise IgnoreRequest("No submit selector found to click on the page but captcha found") - return self.__gen_response(response) - - def __gen_response(self, response): - main_response_data = dict() - main_response_data['page_id'] = None if self.__is_closing(response) else response.puppeteer_request.page_id - - main_response = self._page_responses.pop(response.page_id) - - if isinstance(main_response, PuppeteerHtmlResponse): - if isinstance(response.puppeteer_request.action, RecaptchaSolver): - main_response_data['body'] = response.data['html'] - elif isinstance(response.puppeteer_request.action, Click): - main_response_data['body'] = response.body - - return main_response.replace(**main_response_data) - - def __is_closing(self, response, - remove_request: bool = True) -> bool: - main_request = self._page_responses[response.page_id].puppeteer_request - close_page = main_request in self._page_closing - if close_page and remove_request: - self._page_closing.remove(main_request) - return close_page - - -class PuppeteerContextRestoreDownloaderMiddleware: - """ - This middleware allows you to recover puppeteer context. - - If you want to recover puppeteer context starting from the specified first request provide - `recover_context` meta-key with `True` value. - - The middleware uses additionally these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - ... - - Settings: - - RESTORING_LENGTH: int = 1 - number of restorable requests in a sequence. - N_RETRY_RESTORING: int = 1 - number of tries to restore a context. - """ - - """ - WORK SCHEME: - - cases: - 1.) First PptrReq (without Context), Its response is good. After some request-response sequence it fails. Trying to recover it N times. - 2.) First PptrReq (without Context), Its response is bad. We need to try to recover it N times. - - For recovering we use context. If we have it we get first request in sequence and trying to recover everything from the beginning. - If we don't have it then we can send the request One more time in process_response until we get it. 
- """ - - N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" - RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" - - def __init__(self, restoring_length: int, n_retry_restoring: int): - self.restoring_length = restoring_length - self.n_retry_restoring = n_retry_restoring - self.context_requests = {} - self.context_length = {} - - @classmethod - def from_crawler(cls, crawler: Crawler): - restoring_length = crawler.settings.get(cls.RESTORING_LENGTH_SETTING, 1) - if not isinstance(restoring_length, int): - raise TypeError(f"`{cls.RESTORING_LENGTH_SETTING}` must be an integer, got {type(restoring_length)}") - elif restoring_length < 1: - raise ValueError(f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1") - - n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) - if not isinstance(n_retry_restoring, int): - raise TypeError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be an integer, got {type(n_retry_restoring)}") - elif n_retry_restoring < 1: - raise ValueError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1") - - return cls(restoring_length, n_retry_restoring) - - @staticmethod - def process_request(request, spider): - if not isinstance(request, PuppeteerRequest): - return None - - if not request.meta.pop('recover_context', False): - return None - - if request.context_id or request.page_id: - raise IgnoreRequest(f"Request {request} is not in the beginning of the request-response sequence") - - request.meta['__request_binding'] = True - request.dont_filter = True - return None - - def process_response(self, request: Request, response, spider): - puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get('puppeteer_request', None) - request_binding = puppeteer_request is not None and puppeteer_request.meta.get('__request_binding', False) # TODO: it's too difficult - - if isinstance(response, PuppeteerResponse): - if request_binding: - self._bind_context(request, response) - if response.context_id in self.context_length: - # Update number of actions in context - self.context_length[response.context_id] += 1 - elif puppeteer_request is not None and response.status == HTTPStatus.UNPROCESSABLE_ENTITY: - # One PuppeteerRequest has failed with 422 error - if request_binding: - # Could not get context, retry - if request.meta.get('__restore_count', 0) < self.n_retry_restoring: - request.meta['__restore_count'] += 1 - return request - else: - return self._restore_context(response) - return response - - def _bind_context(self, request, response): - if request.meta.get('__context_id', None) is not None: - # Need to update context_id - self.__delete_context(request.meta['__context_id'], "DELETING OLD CONTEXT") - restoring_request = request.copy() - restoring_request.meta['__restore_count'] = restoring_request.meta.get('__restore_count', 0) - restoring_request.meta['__context_id'] = response.context_id - self.context_requests[response.context_id] = restoring_request - self.context_length[response.context_id] = 0 - - def _restore_context(self, response): - context_id = json.loads(response.text).get('contextId', None) - - if context_id in self.context_requests: - restoring_request = self.context_requests[context_id] - - if self.context_length[context_id] >= self.restoring_length + 1: - # Too many actions in context - self.__delete_context(context_id, "TOO MANY ACTIONS IN CONTEXT") - elif restoring_request.meta['__restore_count'] >= self.n_retry_restoring: - # Too many retries - self.__delete_context(context_id, "TOO MANY RETRIES") - else: - # 
Restoring - restoring_request.meta['__restore_count'] += 1 - print(f"Restoring the request {restoring_request}") # TODO: to make logging - self.context_length[context_id] = 1 - return restoring_request - return response - - def __delete_context(self, context_id: str, reason: str): - del self.context_length[context_id] - del self.context_requests[context_id] - print(reason) diff --git a/scrapypuppeteer/middleware/__init__.py b/scrapypuppeteer/middleware/__init__.py new file mode 100644 index 0000000..9d63a08 --- /dev/null +++ b/scrapypuppeteer/middleware/__init__.py @@ -0,0 +1,3 @@ +from .service_middleware import PuppeteerServiceDownloaderMiddleware +from .recaptcha_middleware import PuppeteerRecaptchaDownloaderMiddleware +from .restore_middleware import PuppeteerContextRestoreDownloaderMiddleware diff --git a/scrapypuppeteer/middleware/recaptcha_middleware.py b/scrapypuppeteer/middleware/recaptcha_middleware.py new file mode 100644 index 0000000..36013a3 --- /dev/null +++ b/scrapypuppeteer/middleware/recaptcha_middleware.py @@ -0,0 +1,175 @@ +import logging + +from scrapy.crawler import Crawler +from scrapy.exceptions import IgnoreRequest, NotConfigured + +from scrapypuppeteer.actions import Click, RecaptchaSolver, Screenshot, Scroll, CustomJsAction +from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse +from scrapypuppeteer.request import PuppeteerRequest + +recaptcha_logger = logging.getLogger(__name__) + + +class PuppeteerRecaptchaDownloaderMiddleware: + """ + This middleware is supposed to solve recaptcha on the page automatically. + If there is no captcha on the page then this middleware will do nothing + on the page, so your 2captcha balance will remain the same. + It can submit recaptcha if "submit button" is provided. + It will not "submit" captcha if there is no submit-selector. + + If you want to turn Recaptcha solving off on the exact request provide + meta-key 'dont_recaptcha' with True value. The middleware will skip the request + through itself. + + The middleware uses additionally these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + '_captcha_submission', '_captcha_solving' + + Settings: + + RECAPTCHA_ACTIVATION: bool = True - activates or not the middleware (if not - raises NotConfigured) + RECAPTCHA_SOLVING: bool = True - whether solve captcha automatically or not + RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - dictionary consisting of domains and + these domains' submit selectors, e.g. + 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit' + it could be also squeezed to + 'ecaptcha/api2/de': '#recaptcha-demo-submit' + also you can use not just strings but Click actions with required parameters: + 'ogle.com/recaptcha': Click('#recaptcha-demo-submit') + In general - domain is a unique identifying string which is contained in web-page url + If there is no button to submit recaptcha then provide empty string to a domain. + This setting can also be a string. If so the middleware will only click the button + related to this selector. + This setting can also be unprovided. In this case every web-page you crawl is supposed to be + without submit button, or you manually do it yourself. 
+ """ + + MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION" + RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" + SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" + + def __init__(self, + recaptcha_solving: bool, + submit_selectors: dict): + self.submit_selectors = submit_selectors + self.recaptcha_solving = recaptcha_solving + self._page_responses = dict() + self._page_closing = set() + + @classmethod + def from_crawler(cls, crawler: Crawler): + activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True) + if not activation: + raise NotConfigured + recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) + + try: + submit_selectors = crawler.settings.getdict(cls.SUBMIT_SELECTORS_SETTING, dict()) + except ValueError: + submit_selectors = {'': crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, '')} + except Exception as exception: + raise ValueError(f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}") + + for key in submit_selectors.keys(): + submit_selector = submit_selectors[key] + if isinstance(submit_selector, str): + submit_selectors[key] = Click(selector=submit_selector) + elif not isinstance(submit_selector, Click): + raise TypeError(f"Submit selector must be str or Click, got {type(submit_selector)}") + return cls(recaptcha_solving, submit_selectors) + + def process_request(self, request, spider): + if request.meta.get('dont_recaptcha', False): + return None + + if isinstance(request, PuppeteerRequest): + if request.close_page and not request.meta.get('_captcha_submission', False): + request.close_page = False + request.dont_filter = True + self._page_closing.add(request) + return request + return None + + def process_response(self, + request, response, + spider): + if not isinstance(response, PuppeteerResponse): # We only work with PuppeteerResponses + return response + + puppeteer_request = response.puppeteer_request + if puppeteer_request.meta.get('dont_recaptcha', False): # Skip such responses + return response + + if puppeteer_request.meta.pop('_captcha_submission', False): # Submitted captcha + return self.__gen_response(response) + + if puppeteer_request.meta.pop('_captcha_solving', False): + # RECaptchaSolver was called by recaptcha middleware + return self._submit_recaptcha(request, response, spider) + + if isinstance(puppeteer_request.action, + (Screenshot, Scroll, CustomJsAction, RecaptchaSolver)): + # No recaptcha after this action + return response + + # Any puppeteer response besides RecaptchaSolver's PuppeteerResponse + return self._solve_recaptcha(request, response) + + def _solve_recaptcha(self, request, response): + self._page_responses[response.page_id] = response # Saving main response to return it later + + recaptcha_solver = RecaptchaSolver(solve_recaptcha=self.recaptcha_solving, + close_on_empty=self.__is_closing(response, remove_request=False)) + return response.follow(recaptcha_solver, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta={'_captcha_solving': True}, + close_page=False) + + def _submit_recaptcha(self, request, response, spider): + response_data = response.data + if not response.puppeteer_request.action.solve_recaptcha: + recaptcha_logger.log(level=logging.INFO, + msg=f"Found {len(response_data['recaptcha_data']['captchas'])} captcha " + f"but did not solve due to argument", + ) + return self.__gen_response(response) + # Click "submit button"? 
+ if response_data['recaptcha_data']['captchas'] and self.submit_selectors: + # We need to click "submit button" + for domain, submitting in self.submit_selectors.items(): + if domain in response.url: + if not submitting.selector: + return self.__gen_response(response) + return response.follow(action=submitting, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + close_page=self.__is_closing(response), + meta={'_captcha_submission': True}) + raise IgnoreRequest("No submit selector found to click on the page but captcha found") + return self.__gen_response(response) + + def __gen_response(self, response): + main_response_data = dict() + main_response_data['page_id'] = None if self.__is_closing(response) else response.puppeteer_request.page_id + + main_response = self._page_responses.pop(response.page_id) + + if isinstance(main_response, PuppeteerHtmlResponse): + if isinstance(response.puppeteer_request.action, RecaptchaSolver): + main_response_data['body'] = response.data['html'] + elif isinstance(response.puppeteer_request.action, Click): + main_response_data['body'] = response.body + + return main_response.replace(**main_response_data) + + def __is_closing(self, response, + remove_request: bool = True) -> bool: + main_request = self._page_responses[response.page_id].puppeteer_request + close_page = main_request in self._page_closing + if close_page and remove_request: + self._page_closing.remove(main_request) + return close_page diff --git a/scrapypuppeteer/middleware/restore_middleware.py b/scrapypuppeteer/middleware/restore_middleware.py new file mode 100644 index 0000000..43db9f9 --- /dev/null +++ b/scrapypuppeteer/middleware/restore_middleware.py @@ -0,0 +1,144 @@ +import json +import logging + +from typing import Union +from http import HTTPStatus + +from scrapy.crawler import Crawler +from scrapy.exceptions import IgnoreRequest + +from scrapypuppeteer.response import PuppeteerResponse +from scrapypuppeteer.request import PuppeteerRequest + +restore_logger = logging.getLogger(__name__) + + +class PuppeteerContextRestoreDownloaderMiddleware: + """ + This middleware allows you to recover puppeteer context. + + If you want to recover puppeteer context starting from the specified first request provide + `recover_context` meta-key with `True` value. + + The middleware uses additionally these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + ... + + Settings: + + RESTORING_LENGTH: int = 1 - number of restorable requests in a sequence. + N_RETRY_RESTORING: int = 1 - number of tries to restore a context. + """ + + """ + WORK SCHEME: + + cases: + 1.) First PptrReq (without Context), Its response is good. After some request-response sequence it fails. + Trying to recover it N times. + 2.) First PptrReq (without Context), Its response is bad. We need to try to recover it N times. + + For recovering we use context. If we have it we get first request in sequence and trying to recover everything + from the beginning. + If we don't have it then we can send the request One more time in process_response until we get it. 
+ """ + + N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" + RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" + + def __init__(self, restoring_length: int, n_retry_restoring: int): + self.restoring_length = restoring_length + self.n_retry_restoring = n_retry_restoring + self.context_requests = {} + self.context_length = {} + + @classmethod + def from_crawler(cls, crawler: Crawler): + restoring_length = crawler.settings.get(cls.RESTORING_LENGTH_SETTING, 1) + if not isinstance(restoring_length, int): + raise TypeError(f"`{cls.RESTORING_LENGTH_SETTING}` must be an integer, got {type(restoring_length)}") + elif restoring_length < 1: + raise ValueError(f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1") + + n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) + if not isinstance(n_retry_restoring, int): + raise TypeError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be an integer, got {type(n_retry_restoring)}") + elif n_retry_restoring < 1: + raise ValueError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1") + + return cls(restoring_length, n_retry_restoring) + + @staticmethod + def process_request(request, spider): + if not isinstance(request, PuppeteerRequest): + return None + + if not request.meta.pop('recover_context', False): + return None + + if request.context_id or request.page_id: + raise IgnoreRequest(f"Request {request} is not in the beginning of the request-response sequence") + + request.meta['__request_binding'] = True + request.dont_filter = True + return None + + def process_response(self, request, response, spider): + puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get('puppeteer_request', None) + request_binding = puppeteer_request is not None and puppeteer_request.meta.get('__request_binding', False) + + if isinstance(response, PuppeteerResponse): + if request_binding: + self._bind_context(request, response) + if response.context_id in self.context_length: + # Update number of actions in context + self.context_length[response.context_id] += 1 + elif puppeteer_request is not None and response.status == HTTPStatus.UNPROCESSABLE_ENTITY: + # One PuppeteerRequest has failed with 422 error + if request_binding: + # Could not get context, retry + if request.meta.get('__restore_count', 0) < self.n_retry_restoring: + request.meta['__restore_count'] += 1 + return request + else: + return self._restore_context(response) + return response + + def _bind_context(self, request, response): + if request.meta.get('__context_id', None) is not None: + # Need to update context_id + self.__delete_context(request.meta['__context_id'], None) + restoring_request = request.copy() + restoring_request.meta['__restore_count'] = restoring_request.meta.get('__restore_count', 0) + restoring_request.meta['__context_id'] = response.context_id + self.context_requests[response.context_id] = restoring_request + self.context_length[response.context_id] = 0 + + def _restore_context(self, response): + context_id = json.loads(response.text).get('contextId', None) + + if context_id in self.context_requests: + restoring_request = self.context_requests[context_id] + + if self.context_length[context_id] >= self.restoring_length + 1: + # Too many actions in context + self.__delete_context(context_id, f"Too many actions in context ({restoring_request}). Deleting it.") + elif restoring_request.meta['__restore_count'] >= self.n_retry_restoring: + # Too many retries + self.__delete_context(context_id, f"Too many retries in context ({restoring_request}). 
Deleting it.") + else: + # Restoring + restoring_request.meta['__restore_count'] += 1 + restore_logger.log(level=logging.DEBUG, + msg=f"Restoring the request {restoring_request}") + self.context_length[context_id] = 1 + return restoring_request + return response + + def __delete_context(self, context_id: str, reason: Union[str, None]): + del self.context_length[context_id] + del self.context_requests[context_id] + + if reason is not None: + restore_logger.log(level=logging.INFO, + msg=reason) diff --git a/scrapypuppeteer/middleware/service_middleware.py b/scrapypuppeteer/middleware/service_middleware.py new file mode 100644 index 0000000..aaa1321 --- /dev/null +++ b/scrapypuppeteer/middleware/service_middleware.py @@ -0,0 +1,208 @@ +import json + +from collections import defaultdict +from typing import List, Union +from urllib.parse import urlencode, urljoin +from http import HTTPStatus + +from scrapy import Request, signals +from scrapy.crawler import Crawler +from scrapy.http import Headers, TextResponse + +from scrapypuppeteer.actions import Click, GoBack, GoForward, GoTo, Screenshot, Scroll +from scrapypuppeteer.response import PuppeteerHtmlResponse, PuppeteerScreenshotResponse, PuppeteerJsonResponse +from scrapypuppeteer.request import ActionRequest, PuppeteerRequest + + +class PuppeteerServiceDownloaderMiddleware: + """ + This downloader middleware converts PuppeteerRequest instances to + Puppeteer service API requests and then converts its responses to + PuppeteerResponse instances. Additionally, it tracks all browser contexts + that spider uses and performs cleanup request to service once spider + is closed. + + Additionally, the middleware uses these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' + + Settings: + + PUPPETEER_SERVICE_URL (str) + Service URL, e.g. 'http://localhost:3000' + + PUPPETEER_INCLUDE_HEADERS (bool|list[str]) + Determines which request headers will be sent to remote site by puppeteer service. + Either True (all headers), False (no headers) or list of header names. + May be overridden per request. + By default, only cookies are sent. + + PUPPETEER_INCLUDE_META (bool) + Determines whether to send or not user's meta attached by user. + Default to False. 
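A minimal configuration sketch for the service middleware settings listed in the docstring above; the service URL is a placeholder and the header list simply mirrors the default.

```Python
# Illustrative settings; the service URL is a placeholder.
DOWNLOADER_MIDDLEWARES = {
    "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
}

PUPPETEER_SERVICE_URL = "http://localhost:3000"
PUPPETEER_INCLUDE_HEADERS = ["Cookie"]  # True (all headers), False (none) or a list of names
PUPPETEER_INCLUDE_META = False          # do not forward user meta to the service
```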
+ """ + + SERVICE_URL_SETTING = 'PUPPETEER_SERVICE_URL' + INCLUDE_HEADERS_SETTING = 'PUPPETEER_INCLUDE_HEADERS' + SERVICE_META_SETTING = 'PUPPETEER_INCLUDE_META' + DEFAULT_INCLUDE_HEADERS = ['Cookie'] # TODO send them separately + + def __init__(self, + crawler: Crawler, + service_url: str, + include_headers: Union[bool, List[str]], + include_meta: bool): + self.service_base_url = service_url + self.include_headers = include_headers + self.include_meta = include_meta + self.crawler = crawler + self.used_contexts = defaultdict(set) + + @classmethod + def from_crawler(cls, crawler): + service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) + if service_url is None: + raise ValueError('Puppeteer service URL must be provided') + if cls.INCLUDE_HEADERS_SETTING in crawler.settings: + try: + include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) + except ValueError: + include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) + else: + include_headers = cls.DEFAULT_INCLUDE_HEADERS + include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) + middleware = cls(crawler, service_url, include_headers, include_meta) + crawler.signals.connect(middleware.close_used_contexts, + signal=signals.spider_closed) + return middleware + + def process_request(self, request, spider): + if not isinstance(request, PuppeteerRequest): + return + + action = request.action + service_url = urljoin(self.service_base_url, action.endpoint) + service_params = self._encode_service_params(request) + if service_params: + service_url += '?' + service_params + + meta = { + 'puppeteer_request': request, + 'dont_obey_robotstxt': True, + 'proxy': None + } + if self.include_meta: + meta = { + **request.meta, + **meta + } + + return ActionRequest( + url=service_url, + action=action, + method='POST', + headers=Headers({'Content-Type': action.content_type}), + body=self._serialize_body(action, request), + dont_filter=True, + cookies=request.cookies, + priority=request.priority, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta=meta + ) + + @staticmethod + def _encode_service_params(request): + service_params = {} + if request.context_id is not None: + service_params['contextId'] = request.context_id + if request.page_id is not None: + service_params['pageId'] = request.page_id + if request.close_page: + service_params['closePage'] = 1 + return urlencode(service_params) + + def _serialize_body(self, action, request): + payload = action.payload() + if action.content_type == 'application/json': + if isinstance(payload, dict): + # disallow null values in top-level request parameters + payload = {k: v for k, v in payload.items() if v is not None} + proxy = request.meta.get('proxy') + if proxy: + payload['proxy'] = proxy + include_headers = self.include_headers if request.include_headers is None else request.include_headers + if include_headers: + headers = request.headers.to_unicode_dict() + if isinstance(include_headers, list): + headers = {h.lower(): headers[h] for h in include_headers if h in headers} + payload['headers'] = headers + return json.dumps(payload) + return str(payload) + + def process_response(self, request, response, spider): + if not isinstance(response, TextResponse): + return response + + puppeteer_request = request.meta.get('puppeteer_request') + if puppeteer_request is None: + return response + + if b'application/json' not in response.headers.get(b'Content-Type', b''): + return response.replace(request=request) + + response_data 
= json.loads(response.text) + response_cls = self._get_response_class(puppeteer_request.action) + + if response.status != HTTPStatus.OK: + context_id = response_data.get('contextId') + if context_id: + self.used_contexts[id(spider)].add(context_id) + return response + + return self._form_response(response_cls, response_data, + puppeteer_request.url, request, puppeteer_request, + spider) + + def _form_response(self, response_cls, response_data, + url, request, puppeteer_request, + spider): + context_id = response_data.pop('contextId', puppeteer_request.context_id) + page_id = response_data.pop('pageId', puppeteer_request.page_id) + + attributes = dict() + for attr in response_cls.attributes: + if attr in response_data: + attributes[attr] = response_data.pop(attr) + if response_data: + attributes['data'] = response_data + + self.used_contexts[id(spider)].add(context_id) + + return response_cls( + url=url, + puppeteer_request=puppeteer_request, + context_id=context_id, + page_id=page_id, + request=request, + **attributes + ) + + @staticmethod + def _get_response_class(request_action): + if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)): + return PuppeteerHtmlResponse + if isinstance(request_action, Screenshot): + return PuppeteerScreenshotResponse + return PuppeteerJsonResponse + + def close_used_contexts(self, spider): + contexts = list(self.used_contexts[id(spider)]) + if contexts: + request = Request(urljoin(self.service_base_url, '/close_context'), + method='POST', + headers=Headers({'Content-Type': 'application/json'}), + meta={"proxy": None}, + body=json.dumps(contexts)) + return self.crawler.engine.downloader.fetch(request, None) From 1adcd5e73253d1bafc341e8c0808e761d64049c3 Mon Sep 17 00:00:00 2001 From: matthew Date: Wed, 29 May 2024 18:36:18 +0300 Subject: [PATCH 09/18] Demo spider --- examples/spiders/dead_context.py | 74 +++++++++++++++++++ scrapypuppeteer/middleware/__init__.py | 6 +- .../{recaptcha_middleware.py => recaptcha.py} | 0 .../{restore_middleware.py => restore.py} | 0 .../{service_middleware.py => service.py} | 0 5 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 examples/spiders/dead_context.py rename scrapypuppeteer/middleware/{recaptcha_middleware.py => recaptcha.py} (100%) rename scrapypuppeteer/middleware/{restore_middleware.py => restore.py} (100%) rename scrapypuppeteer/middleware/{service_middleware.py => service.py} (100%) diff --git a/examples/spiders/dead_context.py b/examples/spiders/dead_context.py new file mode 100644 index 0000000..4c0a48c --- /dev/null +++ b/examples/spiders/dead_context.py @@ -0,0 +1,74 @@ +import scrapy + +from asyncio import sleep + +from scrapypuppeteer import PuppeteerRequest, PuppeteerResponse +from scrapypuppeteer.actions import Click, GoTo +from twisted.python.failure import Failure + + +class DeadContextSpider(scrapy.Spider): + custom_settings = { + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + 'DOWNLOADER_MIDDLEWARES': { + 'scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware': 1041, + 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042, + }, + 'N_RETRY_RESTORING': 3, + 'RESTORING_LENGTH': 2, + } + name = "dead_context" + + def start_requests(self): + urls = [ + "https://www.google.com/recaptcha/api2/demo", + "https://scrapy.org", + "https://pptr.dev", + ] + + for url in urls: + yield PuppeteerRequest( + url, + callback=self.click_on_navigation, + errback=self.errback, + close_page=False, + meta={'recover_context': 
True} + ) + + async def click_on_navigation(self, response: PuppeteerResponse): + await sleep(4) + + click = Click("#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)") + yield response.follow( + click, + callback=self.click_back, + errback=self.errback, + close_page=False + ) + + async def click_back(self, response: PuppeteerResponse): + await sleep(4) + + click = Click("#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a.navbar__brand > b") + yield response.follow( + click, + callback=self.goto_api, + errback=self.errback, + close_page=False + ) + + async def goto_api(self, response): + await sleep(4) + + yield response.follow(GoTo("api/puppeteer.puppeteernode"), + callback=self.empty_action, + errback=self.errback, + close_page=False) + + @staticmethod + async def empty_action(response, **kwargs): + await sleep(4) + + @staticmethod + def errback(failure: Failure): + print(failure) diff --git a/scrapypuppeteer/middleware/__init__.py b/scrapypuppeteer/middleware/__init__.py index 9d63a08..fa2a319 100644 --- a/scrapypuppeteer/middleware/__init__.py +++ b/scrapypuppeteer/middleware/__init__.py @@ -1,3 +1,3 @@ -from .service_middleware import PuppeteerServiceDownloaderMiddleware -from .recaptcha_middleware import PuppeteerRecaptchaDownloaderMiddleware -from .restore_middleware import PuppeteerContextRestoreDownloaderMiddleware +from .service import PuppeteerServiceDownloaderMiddleware +from .recaptcha import PuppeteerRecaptchaDownloaderMiddleware +from .restore import PuppeteerContextRestoreDownloaderMiddleware diff --git a/scrapypuppeteer/middleware/recaptcha_middleware.py b/scrapypuppeteer/middleware/recaptcha.py similarity index 100% rename from scrapypuppeteer/middleware/recaptcha_middleware.py rename to scrapypuppeteer/middleware/recaptcha.py diff --git a/scrapypuppeteer/middleware/restore_middleware.py b/scrapypuppeteer/middleware/restore.py similarity index 100% rename from scrapypuppeteer/middleware/restore_middleware.py rename to scrapypuppeteer/middleware/restore.py diff --git a/scrapypuppeteer/middleware/service_middleware.py b/scrapypuppeteer/middleware/service.py similarity index 100% rename from scrapypuppeteer/middleware/service_middleware.py rename to scrapypuppeteer/middleware/service.py From de3a5bbdb3ec7e3f277b7456f4733cf6aaf8b3d3 Mon Sep 17 00:00:00 2001 From: matthew Date: Wed, 29 May 2024 18:49:27 +0300 Subject: [PATCH 10/18] docs --- README.md | 38 ++++++++++++++++++++++++++- scrapypuppeteer/middleware/restore.py | 15 +---------- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index ff14805..3ecc377 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ Here is the list of available actions: - `Click(selector, click_options, wait_options)` - click on element on page - `Scroll(selector, wait_options)` - scroll page - `Screenshot(options)` - take screenshot -- `RecaptchaSolver(solve_recaptcha)` - find or solve recaptcha on page +- `RecaptchaSolver(solve_recaptcha, close_on_empty)` - find or solve recaptcha on page - `CustomJsAction(js_function)` - evaluate JS function on page Available options essentially mirror [service](https://github.com/ispras/scrapy-puppeteer-service) method parameters, which in turn mirror puppeteer API functions to some extent. @@ -130,6 +130,42 @@ and will notify you about number of found captchas on the page. If you don't want the middleware to work on specific request you may provide special meta key: `'dont_recaptcha': True`. 
In this case RecaptchaMiddleware will just skip the request. +## Automatic context restoring + +Sometimes you may receive responses with status 422 (Unprocessable Entity). +This means the scrapy-puppeteer-services struggled to find provided context or page in its memory. +In such situations you can use this middleware to restore such contexts. + +Enabling the middleware: +```Python +DOWNLOADER_MIDDLEWARES = { # Strict order of middlewares + # 'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1040, # You may also use recaptcha middleware + 'scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware': 1041, + 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042, +} +``` + +Settings of the middleware: +```Python +N_RETRY_RESTORING = 3 # Number of tries to restore a context +RESTORING_LENGTH = 2 # Number of restorable requests in a sequence +``` + +Currently, the middleware can only restart from the beginning of request-response sequence. +You can start this sequence with `recover_context` meta-key, just provide `True` value. +Example: +```Python +... +yield PuppeteerRequest( + url, + callback=self.click_on_navigation, + errback=self.errback, + close_page=False, + meta={'recover_context': True} +) +... +``` + ## TODO - [x] skeleton that could handle goto, click, scroll, and actions diff --git a/scrapypuppeteer/middleware/restore.py b/scrapypuppeteer/middleware/restore.py index 43db9f9..6600972 100644 --- a/scrapypuppeteer/middleware/restore.py +++ b/scrapypuppeteer/middleware/restore.py @@ -22,7 +22,7 @@ class PuppeteerContextRestoreDownloaderMiddleware: The middleware uses additionally these meta-keys, do not use them, because their changing could possibly (almost probably) break determined behaviour: - ... + `__request_binding`, `__restore_count`, `__context_id`. Settings: @@ -30,19 +30,6 @@ class PuppeteerContextRestoreDownloaderMiddleware: N_RETRY_RESTORING: int = 1 - number of tries to restore a context. """ - """ - WORK SCHEME: - - cases: - 1.) First PptrReq (without Context), Its response is good. After some request-response sequence it fails. - Trying to recover it N times. - 2.) First PptrReq (without Context), Its response is bad. We need to try to recover it N times. - - For recovering we use context. If we have it we get first request in sequence and trying to recover everything - from the beginning. - If we don't have it then we can send the request One more time in process_response until we get it. 
- """ - N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" From b43a1d5e087605493b9f207037bdad94b7180d57 Mon Sep 17 00:00:00 2001 From: matthew Date: Wed, 29 May 2024 19:15:41 +0300 Subject: [PATCH 11/18] Proper merging --- examples/spiders/dead_context.py | 40 +++---- scrapypuppeteer/middleware.py | 0 scrapypuppeteer/middleware/recaptcha.py | 127 +++++++++++++--------- scrapypuppeteer/middleware/restore.py | 76 ++++++++----- scrapypuppeteer/middleware/service.py | 139 ++++++++++++++---------- 5 files changed, 229 insertions(+), 153 deletions(-) delete mode 100644 scrapypuppeteer/middleware.py diff --git a/examples/spiders/dead_context.py b/examples/spiders/dead_context.py index 4c0a48c..93dbee5 100644 --- a/examples/spiders/dead_context.py +++ b/examples/spiders/dead_context.py @@ -10,12 +10,12 @@ class DeadContextSpider(scrapy.Spider): custom_settings = { "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", - 'DOWNLOADER_MIDDLEWARES': { - 'scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware': 1041, - 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042, + "DOWNLOADER_MIDDLEWARES": { + "scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware": 1041, + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, }, - 'N_RETRY_RESTORING': 3, - 'RESTORING_LENGTH': 2, + "N_RETRY_RESTORING": 3, + "RESTORING_LENGTH": 2, } name = "dead_context" @@ -32,38 +32,38 @@ def start_requests(self): callback=self.click_on_navigation, errback=self.errback, close_page=False, - meta={'recover_context': True} + meta={"recover_context": True}, ) async def click_on_navigation(self, response: PuppeteerResponse): await sleep(4) - click = Click("#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)") + click = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)" + ) yield response.follow( - click, - callback=self.click_back, - errback=self.errback, - close_page=False + click, callback=self.click_back, errback=self.errback, close_page=False ) async def click_back(self, response: PuppeteerResponse): await sleep(4) - click = Click("#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a.navbar__brand > b") + click = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a.navbar__brand > b" + ) yield response.follow( - click, - callback=self.goto_api, - errback=self.errback, - close_page=False + click, callback=self.goto_api, errback=self.errback, close_page=False ) async def goto_api(self, response): await sleep(4) - yield response.follow(GoTo("api/puppeteer.puppeteernode"), - callback=self.empty_action, - errback=self.errback, - close_page=False) + yield response.follow( + GoTo("api/puppeteer.puppeteernode"), + callback=self.empty_action, + errback=self.errback, + close_page=False, + ) @staticmethod async def empty_action(response, **kwargs): diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py deleted file mode 100644 index e69de29..0000000 diff --git a/scrapypuppeteer/middleware/recaptcha.py b/scrapypuppeteer/middleware/recaptcha.py index 36013a3..4755d9e 100644 --- a/scrapypuppeteer/middleware/recaptcha.py +++ b/scrapypuppeteer/middleware/recaptcha.py @@ -3,7 +3,13 @@ from scrapy.crawler import Crawler from scrapy.exceptions import IgnoreRequest, NotConfigured -from scrapypuppeteer.actions import Click, RecaptchaSolver, Screenshot, Scroll, CustomJsAction +from scrapypuppeteer.actions 
import ( + Click, + RecaptchaSolver, + Screenshot, + Scroll, + CustomJsAction, +) from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse from scrapypuppeteer.request import PuppeteerRequest @@ -49,9 +55,7 @@ class PuppeteerRecaptchaDownloaderMiddleware: RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" - def __init__(self, - recaptcha_solving: bool, - submit_selectors: dict): + def __init__(self, recaptcha_solving: bool, submit_selectors: dict): self.submit_selectors = submit_selectors self.recaptcha_solving = recaptcha_solving self._page_responses = dict() @@ -65,109 +69,134 @@ def from_crawler(cls, crawler: Crawler): recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) try: - submit_selectors = crawler.settings.getdict(cls.SUBMIT_SELECTORS_SETTING, dict()) + submit_selectors = crawler.settings.getdict( + cls.SUBMIT_SELECTORS_SETTING, dict() + ) except ValueError: - submit_selectors = {'': crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, '')} + submit_selectors = { + "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "") + } except Exception as exception: - raise ValueError(f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}") + raise ValueError( + f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}" + ) for key in submit_selectors.keys(): submit_selector = submit_selectors[key] if isinstance(submit_selector, str): submit_selectors[key] = Click(selector=submit_selector) elif not isinstance(submit_selector, Click): - raise TypeError(f"Submit selector must be str or Click, got {type(submit_selector)}") + raise TypeError( + f"Submit selector must be str or Click, got {type(submit_selector)}" + ) return cls(recaptcha_solving, submit_selectors) def process_request(self, request, spider): - if request.meta.get('dont_recaptcha', False): + if request.meta.get("dont_recaptcha", False): return None if isinstance(request, PuppeteerRequest): - if request.close_page and not request.meta.get('_captcha_submission', False): + if request.close_page and not request.meta.get( + "_captcha_submission", False + ): request.close_page = False request.dont_filter = True self._page_closing.add(request) return request return None - def process_response(self, - request, response, - spider): - if not isinstance(response, PuppeteerResponse): # We only work with PuppeteerResponses + def process_response(self, request, response, spider): + if not isinstance( + response, PuppeteerResponse + ): # We only work with PuppeteerResponses return response puppeteer_request = response.puppeteer_request - if puppeteer_request.meta.get('dont_recaptcha', False): # Skip such responses + if puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses return response - if puppeteer_request.meta.pop('_captcha_submission', False): # Submitted captcha + if puppeteer_request.meta.pop( + "_captcha_submission", False + ): # Submitted captcha return self.__gen_response(response) - if puppeteer_request.meta.pop('_captcha_solving', False): + if puppeteer_request.meta.pop("_captcha_solving", False): # RECaptchaSolver was called by recaptcha middleware return self._submit_recaptcha(request, response, spider) - if isinstance(puppeteer_request.action, - (Screenshot, Scroll, CustomJsAction, RecaptchaSolver)): - # No recaptcha after this action + if isinstance( + puppeteer_request.action, + (Screenshot, Scroll, CustomJsAction, RecaptchaSolver), + ): + # No recaptcha after these actions return 
response - # Any puppeteer response besides RecaptchaSolver's PuppeteerResponse + # Any puppeteer response besides PuppeteerRecaptchaSolverResponse return self._solve_recaptcha(request, response) def _solve_recaptcha(self, request, response): - self._page_responses[response.page_id] = response # Saving main response to return it later - - recaptcha_solver = RecaptchaSolver(solve_recaptcha=self.recaptcha_solving, - close_on_empty=self.__is_closing(response, remove_request=False)) - return response.follow(recaptcha_solver, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta={'_captcha_solving': True}, - close_page=False) + self._page_responses[response.page_id] = ( + response # Saving main response to return it later + ) + + recaptcha_solver = RecaptchaSolver( + solve_recaptcha=self.recaptcha_solving, + close_on_empty=self.__is_closing(response, remove_request=False), + ) + return response.follow( + recaptcha_solver, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta={"_captcha_solving": True}, + close_page=False, + ) def _submit_recaptcha(self, request, response, spider): - response_data = response.data if not response.puppeteer_request.action.solve_recaptcha: - recaptcha_logger.log(level=logging.INFO, - msg=f"Found {len(response_data['recaptcha_data']['captchas'])} captcha " - f"but did not solve due to argument", - ) + recaptcha_logger.log( + level=logging.INFO, + msg=f"Found {len(response.recaptcha_data['captchas'])} captcha " + f"but did not solve due to argument", + ) return self.__gen_response(response) # Click "submit button"? - if response_data['recaptcha_data']['captchas'] and self.submit_selectors: + if response.recaptcha_data["captchas"] and self.submit_selectors: # We need to click "submit button" for domain, submitting in self.submit_selectors.items(): if domain in response.url: if not submitting.selector: return self.__gen_response(response) - return response.follow(action=submitting, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - close_page=self.__is_closing(response), - meta={'_captcha_submission': True}) - raise IgnoreRequest("No submit selector found to click on the page but captcha found") + return response.follow( + action=submitting, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + close_page=self.__is_closing(response), + meta={"_captcha_submission": True}, + ) + raise IgnoreRequest( + "No submit selector found to click on the page but captcha found" + ) return self.__gen_response(response) def __gen_response(self, response): main_response_data = dict() - main_response_data['page_id'] = None if self.__is_closing(response) else response.puppeteer_request.page_id + main_response_data["page_id"] = ( + None if self.__is_closing(response) else response.puppeteer_request.page_id + ) main_response = self._page_responses.pop(response.page_id) if isinstance(main_response, PuppeteerHtmlResponse): if isinstance(response.puppeteer_request.action, RecaptchaSolver): - main_response_data['body'] = response.data['html'] + main_response_data["body"] = response.html elif isinstance(response.puppeteer_request.action, Click): - main_response_data['body'] = response.body + main_response_data["body"] = response.body return main_response.replace(**main_response_data) - def __is_closing(self, response, - remove_request: bool = True) -> bool: + def __is_closing(self, response, remove_request: bool = True) -> bool: main_request = 
self._page_responses[response.page_id].puppeteer_request close_page = main_request in self._page_closing if close_page and remove_request: diff --git a/scrapypuppeteer/middleware/restore.py b/scrapypuppeteer/middleware/restore.py index 6600972..db6e1ad 100644 --- a/scrapypuppeteer/middleware/restore.py +++ b/scrapypuppeteer/middleware/restore.py @@ -43,15 +43,23 @@ def __init__(self, restoring_length: int, n_retry_restoring: int): def from_crawler(cls, crawler: Crawler): restoring_length = crawler.settings.get(cls.RESTORING_LENGTH_SETTING, 1) if not isinstance(restoring_length, int): - raise TypeError(f"`{cls.RESTORING_LENGTH_SETTING}` must be an integer, got {type(restoring_length)}") + raise TypeError( + f"`{cls.RESTORING_LENGTH_SETTING}` must be an integer, got {type(restoring_length)}" + ) elif restoring_length < 1: - raise ValueError(f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1") + raise ValueError( + f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1" + ) n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) if not isinstance(n_retry_restoring, int): - raise TypeError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be an integer, got {type(n_retry_restoring)}") + raise TypeError( + f"`{cls.N_RETRY_RESTORING_SETTING}` must be an integer, got {type(n_retry_restoring)}" + ) elif n_retry_restoring < 1: - raise ValueError(f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1") + raise ValueError( + f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1" + ) return cls(restoring_length, n_retry_restoring) @@ -60,19 +68,25 @@ def process_request(request, spider): if not isinstance(request, PuppeteerRequest): return None - if not request.meta.pop('recover_context', False): + if not request.meta.pop("recover_context", False): return None if request.context_id or request.page_id: - raise IgnoreRequest(f"Request {request} is not in the beginning of the request-response sequence") + raise IgnoreRequest( + f"Request {request} is not in the beginning of the request-response sequence" + ) - request.meta['__request_binding'] = True + request.meta["__request_binding"] = True request.dont_filter = True return None def process_response(self, request, response, spider): - puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get('puppeteer_request', None) - request_binding = puppeteer_request is not None and puppeteer_request.meta.get('__request_binding', False) + puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get( + "puppeteer_request", None + ) + request_binding = puppeteer_request is not None and puppeteer_request.meta.get( + "__request_binding", False + ) if isinstance(response, PuppeteerResponse): if request_binding: @@ -80,44 +94,57 @@ def process_response(self, request, response, spider): if response.context_id in self.context_length: # Update number of actions in context self.context_length[response.context_id] += 1 - elif puppeteer_request is not None and response.status == HTTPStatus.UNPROCESSABLE_ENTITY: + elif ( + puppeteer_request is not None + and response.status == HTTPStatus.UNPROCESSABLE_ENTITY + ): # One PuppeteerRequest has failed with 422 error if request_binding: # Could not get context, retry - if request.meta.get('__restore_count', 0) < self.n_retry_restoring: - request.meta['__restore_count'] += 1 + if request.meta.get("__restore_count", 0) < self.n_retry_restoring: + request.meta["__restore_count"] += 1 return request else: return self._restore_context(response) 
return response def _bind_context(self, request, response): - if request.meta.get('__context_id', None) is not None: + if request.meta.get("__context_id", None) is not None: # Need to update context_id - self.__delete_context(request.meta['__context_id'], None) + self.__delete_context(request.meta["__context_id"], None) restoring_request = request.copy() - restoring_request.meta['__restore_count'] = restoring_request.meta.get('__restore_count', 0) - restoring_request.meta['__context_id'] = response.context_id + restoring_request.meta["__restore_count"] = restoring_request.meta.get( + "__restore_count", 0 + ) + restoring_request.meta["__context_id"] = response.context_id self.context_requests[response.context_id] = restoring_request self.context_length[response.context_id] = 0 def _restore_context(self, response): - context_id = json.loads(response.text).get('contextId', None) + context_id = json.loads(response.text).get("contextId", None) if context_id in self.context_requests: restoring_request = self.context_requests[context_id] if self.context_length[context_id] >= self.restoring_length + 1: # Too many actions in context - self.__delete_context(context_id, f"Too many actions in context ({restoring_request}). Deleting it.") - elif restoring_request.meta['__restore_count'] >= self.n_retry_restoring: + self.__delete_context( + context_id, + f"Too many actions in context ({restoring_request}). Deleting it.", + ) + elif restoring_request.meta["__restore_count"] >= self.n_retry_restoring: # Too many retries - self.__delete_context(context_id, f"Too many retries in context ({restoring_request}). Deleting it.") + self.__delete_context( + context_id, + f"Too many retries in context ({restoring_request}). Deleting it.", + ) else: # Restoring - restoring_request.meta['__restore_count'] += 1 - restore_logger.log(level=logging.DEBUG, - msg=f"Restoring the request {restoring_request}") + restoring_request.meta["__restore_count"] += 1 + restore_logger.log( + level=logging.DEBUG, + msg=f"Restoring the request {restoring_request}", + ) self.context_length[context_id] = 1 return restoring_request return response @@ -127,5 +154,4 @@ def __delete_context(self, context_id: str, reason: Union[str, None]): del self.context_requests[context_id] if reason is not None: - restore_logger.log(level=logging.INFO, - msg=reason) + restore_logger.log(level=logging.INFO, msg=reason) diff --git a/scrapypuppeteer/middleware/service.py b/scrapypuppeteer/middleware/service.py index aaa1321..49deef9 100644 --- a/scrapypuppeteer/middleware/service.py +++ b/scrapypuppeteer/middleware/service.py @@ -9,8 +9,21 @@ from scrapy.crawler import Crawler from scrapy.http import Headers, TextResponse -from scrapypuppeteer.actions import Click, GoBack, GoForward, GoTo, Screenshot, Scroll -from scrapypuppeteer.response import PuppeteerHtmlResponse, PuppeteerScreenshotResponse, PuppeteerJsonResponse +from scrapypuppeteer.actions import ( + Click, + GoBack, + GoForward, + GoTo, + RecaptchaSolver, + Screenshot, + Scroll, +) +from scrapypuppeteer.response import ( + PuppeteerHtmlResponse, + PuppeteerScreenshotResponse, + PuppeteerRecaptchaSolverResponse, + PuppeteerJsonResponse, +) from scrapypuppeteer.request import ActionRequest, PuppeteerRequest @@ -42,16 +55,18 @@ class PuppeteerServiceDownloaderMiddleware: Default to False. 
""" - SERVICE_URL_SETTING = 'PUPPETEER_SERVICE_URL' - INCLUDE_HEADERS_SETTING = 'PUPPETEER_INCLUDE_HEADERS' - SERVICE_META_SETTING = 'PUPPETEER_INCLUDE_META' - DEFAULT_INCLUDE_HEADERS = ['Cookie'] # TODO send them separately - - def __init__(self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool): + SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" + INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" + SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" + DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + + def __init__( + self, + crawler: Crawler, + service_url: str, + include_headers: Union[bool, List[str]], + include_meta: bool, + ): self.service_base_url = service_url self.include_headers = include_headers self.include_meta = include_meta @@ -62,7 +77,7 @@ def __init__(self, def from_crawler(cls, crawler): service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) if service_url is None: - raise ValueError('Puppeteer service URL must be provided') + raise ValueError("Puppeteer service URL must be provided") if cls.INCLUDE_HEADERS_SETTING in crawler.settings: try: include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) @@ -72,8 +87,9 @@ def from_crawler(cls, crawler): include_headers = cls.DEFAULT_INCLUDE_HEADERS include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) middleware = cls(crawler, service_url, include_headers, include_meta) - crawler.signals.connect(middleware.close_used_contexts, - signal=signals.spider_closed) + crawler.signals.connect( + middleware.close_used_contexts, signal=signals.spider_closed + ) return middleware def process_request(self, request, spider): @@ -84,24 +100,21 @@ def process_request(self, request, spider): service_url = urljoin(self.service_base_url, action.endpoint) service_params = self._encode_service_params(request) if service_params: - service_url += '?' + service_params + service_url += "?" 
+ service_params meta = { - 'puppeteer_request': request, - 'dont_obey_robotstxt': True, - 'proxy': None + "puppeteer_request": request, + "dont_obey_robotstxt": True, + "proxy": None, } if self.include_meta: - meta = { - **request.meta, - **meta - } + meta = {**request.meta, **meta} return ActionRequest( url=service_url, action=action, - method='POST', - headers=Headers({'Content-Type': action.content_type}), + method="POST", + headers=Headers({"Content-Type": action.content_type}), body=self._serialize_body(action, request), dont_filter=True, cookies=request.cookies, @@ -109,35 +122,41 @@ def process_request(self, request, spider): callback=request.callback, cb_kwargs=request.cb_kwargs, errback=request.errback, - meta=meta + meta=meta, ) @staticmethod def _encode_service_params(request): service_params = {} if request.context_id is not None: - service_params['contextId'] = request.context_id + service_params["contextId"] = request.context_id if request.page_id is not None: - service_params['pageId'] = request.page_id + service_params["pageId"] = request.page_id if request.close_page: - service_params['closePage'] = 1 + service_params["closePage"] = 1 return urlencode(service_params) def _serialize_body(self, action, request): payload = action.payload() - if action.content_type == 'application/json': + if action.content_type == "application/json": if isinstance(payload, dict): # disallow null values in top-level request parameters payload = {k: v for k, v in payload.items() if v is not None} - proxy = request.meta.get('proxy') + proxy = request.meta.get("proxy") if proxy: - payload['proxy'] = proxy - include_headers = self.include_headers if request.include_headers is None else request.include_headers + payload["proxy"] = proxy + include_headers = ( + self.include_headers + if request.include_headers is None + else request.include_headers + ) if include_headers: headers = request.headers.to_unicode_dict() if isinstance(include_headers, list): - headers = {h.lower(): headers[h] for h in include_headers if h in headers} - payload['headers'] = headers + headers = { + h.lower(): headers[h] for h in include_headers if h in headers + } + payload["headers"] = headers return json.dumps(payload) return str(payload) @@ -145,38 +164,36 @@ def process_response(self, request, response, spider): if not isinstance(response, TextResponse): return response - puppeteer_request = request.meta.get('puppeteer_request') + puppeteer_request = request.meta.get("puppeteer_request") if puppeteer_request is None: return response - if b'application/json' not in response.headers.get(b'Content-Type', b''): + if b"application/json" not in response.headers.get(b"Content-Type", b""): return response.replace(request=request) response_data = json.loads(response.text) response_cls = self._get_response_class(puppeteer_request.action) if response.status != HTTPStatus.OK: - context_id = response_data.get('contextId') + context_id = response_data.get("contextId") if context_id: self.used_contexts[id(spider)].add(context_id) return response - return self._form_response(response_cls, response_data, - puppeteer_request.url, request, puppeteer_request, - spider) - - def _form_response(self, response_cls, response_data, - url, request, puppeteer_request, - spider): - context_id = response_data.pop('contextId', puppeteer_request.context_id) - page_id = response_data.pop('pageId', puppeteer_request.page_id) + return self._form_response( + response_cls, + response_data, + puppeteer_request.url, + request, + puppeteer_request, + spider, 
+ ) - attributes = dict() - for attr in response_cls.attributes: - if attr in response_data: - attributes[attr] = response_data.pop(attr) - if response_data: - attributes['data'] = response_data + def _form_response( + self, response_cls, response_data, url, request, puppeteer_request, spider + ): + context_id = response_data.pop("contextId", puppeteer_request.context_id) + page_id = response_data.pop("pageId", puppeteer_request.page_id) self.used_contexts[id(spider)].add(context_id) @@ -186,7 +203,7 @@ def _form_response(self, response_cls, response_data, context_id=context_id, page_id=page_id, request=request, - **attributes + **response_data, ) @staticmethod @@ -195,14 +212,18 @@ def _get_response_class(request_action): return PuppeteerHtmlResponse if isinstance(request_action, Screenshot): return PuppeteerScreenshotResponse + if isinstance(request_action, RecaptchaSolver): + return PuppeteerRecaptchaSolverResponse return PuppeteerJsonResponse def close_used_contexts(self, spider): contexts = list(self.used_contexts[id(spider)]) if contexts: - request = Request(urljoin(self.service_base_url, '/close_context'), - method='POST', - headers=Headers({'Content-Type': 'application/json'}), - meta={"proxy": None}, - body=json.dumps(contexts)) + request = Request( + urljoin(self.service_base_url, "/close_context"), + method="POST", + headers=Headers({"Content-Type": "application/json"}), + meta={"proxy": None}, + body=json.dumps(contexts), + ) return self.crawler.engine.downloader.fetch(request, None) From 66564619f595c159855ab6280aa1fb665b728be2 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 27 Aug 2024 11:08:33 +0300 Subject: [PATCH 12/18] proper merging --- scrapypuppeteer/middleware.py | 330 +----------------- scrapypuppeteer/middleware/service.py | 229 ------------ .../{middleware => middlewares}/__init__.py | 0 .../{middleware => middlewares}/recaptcha.py | 0 .../{middleware => middlewares}/restore.py | 0 scrapypuppeteer/middlewares/service.py | 112 ++++++ 6 files changed, 129 insertions(+), 542 deletions(-) delete mode 100644 scrapypuppeteer/middleware/service.py rename scrapypuppeteer/{middleware => middlewares}/__init__.py (100%) rename scrapypuppeteer/{middleware => middlewares}/recaptcha.py (100%) rename scrapypuppeteer/{middleware => middlewares}/restore.py (100%) create mode 100644 scrapypuppeteer/middlewares/service.py diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index 99568c5..68d454a 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -1,320 +1,24 @@ -import logging -from collections import defaultdict -from typing import List, Union +import warnings -from scrapy import signals -from scrapy.crawler import Crawler -from scrapy.exceptions import IgnoreRequest, NotConfigured +import scrapy.exceptions - -from scrapypuppeteer.actions import ( - Click, - RecaptchaSolver, - Screenshot, - Scroll, - CustomJsAction, -) -from scrapypuppeteer.response import ( - PuppeteerResponse, - PuppeteerHtmlResponse, -) -from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( - PyppeteerBrowserManager, +from .middlewares import ( + PuppeteerServiceDownloaderMiddleware, + PuppeteerRecaptchaDownloaderMiddleware, + PuppeteerContextRestoreDownloaderMiddleware, ) -from scrapypuppeteer.browser_managers.service_browser_manager import ( - ServiceBrowserManager, -) -from scrapypuppeteer.browser_managers.playwright_browser_manager import ( - 
PlaywrightBrowserManager, -) - -from scrapypuppeteer.browser_managers import BrowserManager - - -class PuppeteerServiceDownloaderMiddleware: - """ - This downloader middleware converts PuppeteerRequest instances to - Puppeteer service API requests and then converts its responses to - PuppeteerResponse instances. Additionally, it tracks all browser contexts - that spider uses and performs cleanup request to service right before - spider is closed. - - Additionally, the middleware uses these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' - - Settings: - - PUPPETEER_SERVICE_URL (str) - Service URL, e.g. 'http://localhost:3000' - - PUPPETEER_INCLUDE_HEADERS (bool|list[str]) - Determines which request headers will be sent to remote site by puppeteer service. - Either True (all headers), False (no headers) or list of header names. - May be overridden per request. - By default, only cookies are sent. - - PUPPETEER_INCLUDE_META (bool) - Determines whether to send or not user's meta attached by user. - Default to False. - """ - - SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" - INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" - SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" - DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately - - EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" - - service_logger = logging.getLogger(__name__) - - def __init__( - self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool, - browser_manager: BrowserManager, - ): - self.service_base_url = service_url - self.include_headers = include_headers - self.include_meta = include_meta - self.crawler = crawler - self.used_contexts = defaultdict(set) - self.browser_manager = browser_manager - - @classmethod - def from_crawler(cls, crawler): - service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) - if cls.INCLUDE_HEADERS_SETTING in crawler.settings: - try: - include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) - except ValueError: - include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) - else: - include_headers = cls.DEFAULT_INCLUDE_HEADERS - include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) - - execution_method = crawler.settings.get( - cls.EXECUTION_METHOD_SETTING, "PUPPETEER" - ).lower() - - if execution_method == "pyppeteer": - browser_manager = PyppeteerBrowserManager() - elif execution_method == "puppeteer": - browser_manager = ServiceBrowserManager( - service_url, include_meta, include_headers, crawler - ) - elif execution_method == "playwright": - browser_manager = PlaywrightBrowserManager() - else: - raise NameError("Wrong EXECUTION_METHOD") - - middleware = cls( - crawler, service_url, include_headers, include_meta, browser_manager - ) - crawler.signals.connect( - middleware.browser_manager.close_used_contexts, signal=signals.spider_idle - ) - return middleware - - def process_request(self, request, spider): - return self.browser_manager.process_request(request) - - def process_response(self, request, response, spider): - return self.browser_manager.process_response(self, request, response, spider) - -class PuppeteerRecaptchaDownloaderMiddleware: - """ - This middleware is supposed to solve recaptcha on the page automatically. - If there is no captcha on the page then this middleware will do nothing - on the page, so your 2captcha balance will remain the same. 
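The monolithic `middleware.py` being retired in this hunk selected its browser backend through an `EXECUTION_METHOD` setting. A hypothetical configuration sketch based on that code, assuming the behaviour is preserved after the move into the `middlewares` package:

```Python
# Hypothetical sketch based on the removed code above; whether these settings
# survive unchanged after the package reorganisation is an assumption.
EXECUTION_METHOD = "puppeteer"   # other recognised values: "pyppeteer", "playwright"
PUPPETEER_SERVICE_URL = "http://localhost:3000"  # used by the service backend
```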
- It can submit recaptcha if "submit button" is provided. - It will not "submit" captcha if there is no submit-selector. - If you want to turn Recaptcha solving off on the exact request provide - meta-key 'dont_recaptcha' with True value. The middleware will skip the request - through itself. - - The middleware uses additionally these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - '_captcha_submission', '_captcha_solving' - - Settings: - - RECAPTCHA_ACTIVATION: bool = True - activates or not the middleware (if not - raises NotConfigured) - RECAPTCHA_SOLVING: bool = True - whether solve captcha automatically or not - RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - dictionary consisting of domains and - these domains' submit selectors, e.g. - 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit' - it could be also squeezed to - 'ecaptcha/api2/de': '#recaptcha-demo-submit' - also you can use not just strings but Click actions with required parameters: - 'ogle.com/recaptcha': Click('#recaptcha-demo-submit') - In general - domain is a unique identifying string which is contained in web-page url - If there is no button to submit recaptcha then provide empty string to a domain. - This setting can also be a string. If so the middleware will only click the button - related to this selector. - This setting can also be unprovided. In this case every web-page you crawl is supposed to be - without submit button, or you manually do it yourself. - """ - - MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION" - RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" - SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" - - def __init__(self, recaptcha_solving: bool, submit_selectors: dict): - self.submit_selectors = submit_selectors - self.recaptcha_solving = recaptcha_solving - self._page_responses = dict() - self._page_closing = set() - - @classmethod - def from_crawler(cls, crawler: Crawler): - activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True) - if not activation: - raise NotConfigured - recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) - - try: - submit_selectors = crawler.settings.getdict( - cls.SUBMIT_SELECTORS_SETTING, dict() - ) - except ValueError: - submit_selectors = { - "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "") - } - except Exception as exception: - raise ValueError( - f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}" - ) - - for key in submit_selectors.keys(): - submit_selector = submit_selectors[key] - if isinstance(submit_selector, str): - submit_selectors[key] = Click(selector=submit_selector) - elif not isinstance(submit_selector, Click): - raise ValueError( - "Submit selector must be str or Click," - f"but {type(submit_selector)} provided" - ) - return cls(recaptcha_solving, submit_selectors) - - @staticmethod - def is_recaptcha_producing_action(action) -> bool: - return not isinstance( - action, - (Screenshot, Scroll, CustomJsAction, RecaptchaSolver), - ) - - def process_request(self, request, **_): - if request.meta.get("dont_recaptcha", False): - return None - - # Checking if we need to close page after action - if isinstance(request, PuppeteerRequest): - if self.is_recaptcha_producing_action(request.action): - if request.close_page and not request.meta.get( - "_captcha_submission", False - ): - request.close_page = False - request.dont_filter = True - self._page_closing.add(request) - return request - - def 
process_response(self, request, response, spider): - if not isinstance( - response, PuppeteerResponse - ): # We only work with PuppeteerResponses - return response - - puppeteer_request = response.puppeteer_request - if puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses - return response - - if puppeteer_request.meta.pop( - "_captcha_submission", False - ): # Submitted captcha - return self.__gen_response(response) - - if puppeteer_request.meta.pop("_captcha_solving", False): - # RECaptchaSolver was called by recaptcha middleware - return self._submit_recaptcha(request, response, spider) - - if not self.is_recaptcha_producing_action(puppeteer_request.action): - # No recaptcha after these actions - return response - - # Any puppeteer response besides PuppeteerRecaptchaSolverResponse - return self._solve_recaptcha(request, response) - - def _solve_recaptcha(self, request, response): - self._page_responses[response.page_id] = ( - response # Saving main response to return it later - ) - - recaptcha_solver = RecaptchaSolver( - solve_recaptcha=self.recaptcha_solving, - close_on_empty=self.__is_closing(response, remove_request=False), - ) - return response.follow( - recaptcha_solver, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta={"_captcha_solving": True}, - close_page=False, - ) - - def _submit_recaptcha(self, request, response, spider): - if not response.puppeteer_request.action.solve_recaptcha: - spider.log( - message=f"Found {len(response.recaptcha_data['captchas'])} captcha " - f"but did not solve due to argument", - level=logging.INFO, - ) - return self.__gen_response(response) - # Click "submit button"? - if response.recaptcha_data["captchas"] and self.submit_selectors: - # We need to click "submit button" - for domain, submitting in self.submit_selectors.items(): - if domain in response.url: - if not submitting.selector: - return self.__gen_response(response) - return response.follow( - action=submitting, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - close_page=self.__is_closing(response), - meta={"_captcha_submission": True}, - ) - raise IgnoreRequest( - "No submit selector found to click on the page but captcha found" - ) - return self.__gen_response(response) - - def __gen_response(self, response): - main_response_data = dict() - main_response_data["page_id"] = ( - None if self.__is_closing(response) else response.puppeteer_request.page_id - ) - - main_response = self._page_responses.pop(response.page_id) - - if isinstance(main_response, PuppeteerHtmlResponse): - if isinstance(response.puppeteer_request.action, RecaptchaSolver): - main_response_data["body"] = response.html - elif isinstance(response.puppeteer_request.action, Click): - main_response_data["body"] = response.body +warnings.warn( + "Import from `scrapypuppeteer.middleware` is deprecated. 
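The deprecation shim above keeps the old import path working while pointing users at the new package. A migration sketch:

```Python
# Old path still resolves but emits ScrapyDeprecationWarning:
# from scrapypuppeteer.middleware import PuppeteerServiceDownloaderMiddleware

# Preferred going forward:
from scrapypuppeteer.middlewares import PuppeteerServiceDownloaderMiddleware
```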
" + "Use `scrapypuppeteer.middlewares` instead.", + scrapy.exceptions.ScrapyDeprecationWarning, + stacklevel=2, +) - return main_response.replace(**main_response_data) - def __is_closing(self, response, remove_request: bool = True) -> bool: - main_request = self._page_responses[response.page_id].puppeteer_request - close_page = main_request in self._page_closing - if close_page and remove_request: - self._page_closing.remove(main_request) - return close_page +__all__ = [ + "PuppeteerServiceDownloaderMiddleware", + "PuppeteerRecaptchaDownloaderMiddleware", + "PuppeteerContextRestoreDownloaderMiddleware", +] diff --git a/scrapypuppeteer/middleware/service.py b/scrapypuppeteer/middleware/service.py deleted file mode 100644 index 49deef9..0000000 --- a/scrapypuppeteer/middleware/service.py +++ /dev/null @@ -1,229 +0,0 @@ -import json - -from collections import defaultdict -from typing import List, Union -from urllib.parse import urlencode, urljoin -from http import HTTPStatus - -from scrapy import Request, signals -from scrapy.crawler import Crawler -from scrapy.http import Headers, TextResponse - -from scrapypuppeteer.actions import ( - Click, - GoBack, - GoForward, - GoTo, - RecaptchaSolver, - Screenshot, - Scroll, -) -from scrapypuppeteer.response import ( - PuppeteerHtmlResponse, - PuppeteerScreenshotResponse, - PuppeteerRecaptchaSolverResponse, - PuppeteerJsonResponse, -) -from scrapypuppeteer.request import ActionRequest, PuppeteerRequest - - -class PuppeteerServiceDownloaderMiddleware: - """ - This downloader middleware converts PuppeteerRequest instances to - Puppeteer service API requests and then converts its responses to - PuppeteerResponse instances. Additionally, it tracks all browser contexts - that spider uses and performs cleanup request to service once spider - is closed. - - Additionally, the middleware uses these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' - - Settings: - - PUPPETEER_SERVICE_URL (str) - Service URL, e.g. 'http://localhost:3000' - - PUPPETEER_INCLUDE_HEADERS (bool|list[str]) - Determines which request headers will be sent to remote site by puppeteer service. - Either True (all headers), False (no headers) or list of header names. - May be overridden per request. - By default, only cookies are sent. - - PUPPETEER_INCLUDE_META (bool) - Determines whether to send or not user's meta attached by user. - Default to False. 
- """ - - SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" - INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" - SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" - DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately - - def __init__( - self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool, - ): - self.service_base_url = service_url - self.include_headers = include_headers - self.include_meta = include_meta - self.crawler = crawler - self.used_contexts = defaultdict(set) - - @classmethod - def from_crawler(cls, crawler): - service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) - if service_url is None: - raise ValueError("Puppeteer service URL must be provided") - if cls.INCLUDE_HEADERS_SETTING in crawler.settings: - try: - include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) - except ValueError: - include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) - else: - include_headers = cls.DEFAULT_INCLUDE_HEADERS - include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) - middleware = cls(crawler, service_url, include_headers, include_meta) - crawler.signals.connect( - middleware.close_used_contexts, signal=signals.spider_closed - ) - return middleware - - def process_request(self, request, spider): - if not isinstance(request, PuppeteerRequest): - return - - action = request.action - service_url = urljoin(self.service_base_url, action.endpoint) - service_params = self._encode_service_params(request) - if service_params: - service_url += "?" + service_params - - meta = { - "puppeteer_request": request, - "dont_obey_robotstxt": True, - "proxy": None, - } - if self.include_meta: - meta = {**request.meta, **meta} - - return ActionRequest( - url=service_url, - action=action, - method="POST", - headers=Headers({"Content-Type": action.content_type}), - body=self._serialize_body(action, request), - dont_filter=True, - cookies=request.cookies, - priority=request.priority, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta=meta, - ) - - @staticmethod - def _encode_service_params(request): - service_params = {} - if request.context_id is not None: - service_params["contextId"] = request.context_id - if request.page_id is not None: - service_params["pageId"] = request.page_id - if request.close_page: - service_params["closePage"] = 1 - return urlencode(service_params) - - def _serialize_body(self, action, request): - payload = action.payload() - if action.content_type == "application/json": - if isinstance(payload, dict): - # disallow null values in top-level request parameters - payload = {k: v for k, v in payload.items() if v is not None} - proxy = request.meta.get("proxy") - if proxy: - payload["proxy"] = proxy - include_headers = ( - self.include_headers - if request.include_headers is None - else request.include_headers - ) - if include_headers: - headers = request.headers.to_unicode_dict() - if isinstance(include_headers, list): - headers = { - h.lower(): headers[h] for h in include_headers if h in headers - } - payload["headers"] = headers - return json.dumps(payload) - return str(payload) - - def process_response(self, request, response, spider): - if not isinstance(response, TextResponse): - return response - - puppeteer_request = request.meta.get("puppeteer_request") - if puppeteer_request is None: - return response - - if b"application/json" not in response.headers.get(b"Content-Type", b""): - return 
response.replace(request=request) - - response_data = json.loads(response.text) - response_cls = self._get_response_class(puppeteer_request.action) - - if response.status != HTTPStatus.OK: - context_id = response_data.get("contextId") - if context_id: - self.used_contexts[id(spider)].add(context_id) - return response - - return self._form_response( - response_cls, - response_data, - puppeteer_request.url, - request, - puppeteer_request, - spider, - ) - - def _form_response( - self, response_cls, response_data, url, request, puppeteer_request, spider - ): - context_id = response_data.pop("contextId", puppeteer_request.context_id) - page_id = response_data.pop("pageId", puppeteer_request.page_id) - - self.used_contexts[id(spider)].add(context_id) - - return response_cls( - url=url, - puppeteer_request=puppeteer_request, - context_id=context_id, - page_id=page_id, - request=request, - **response_data, - ) - - @staticmethod - def _get_response_class(request_action): - if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)): - return PuppeteerHtmlResponse - if isinstance(request_action, Screenshot): - return PuppeteerScreenshotResponse - if isinstance(request_action, RecaptchaSolver): - return PuppeteerRecaptchaSolverResponse - return PuppeteerJsonResponse - - def close_used_contexts(self, spider): - contexts = list(self.used_contexts[id(spider)]) - if contexts: - request = Request( - urljoin(self.service_base_url, "/close_context"), - method="POST", - headers=Headers({"Content-Type": "application/json"}), - meta={"proxy": None}, - body=json.dumps(contexts), - ) - return self.crawler.engine.downloader.fetch(request, None) diff --git a/scrapypuppeteer/middleware/__init__.py b/scrapypuppeteer/middlewares/__init__.py similarity index 100% rename from scrapypuppeteer/middleware/__init__.py rename to scrapypuppeteer/middlewares/__init__.py diff --git a/scrapypuppeteer/middleware/recaptcha.py b/scrapypuppeteer/middlewares/recaptcha.py similarity index 100% rename from scrapypuppeteer/middleware/recaptcha.py rename to scrapypuppeteer/middlewares/recaptcha.py diff --git a/scrapypuppeteer/middleware/restore.py b/scrapypuppeteer/middlewares/restore.py similarity index 100% rename from scrapypuppeteer/middleware/restore.py rename to scrapypuppeteer/middlewares/restore.py diff --git a/scrapypuppeteer/middlewares/service.py b/scrapypuppeteer/middlewares/service.py new file mode 100644 index 0000000..4ec41a8 --- /dev/null +++ b/scrapypuppeteer/middlewares/service.py @@ -0,0 +1,112 @@ +import logging +from collections import defaultdict +from typing import List, Union + +from scrapy import signals +from scrapy.crawler import Crawler + +from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( + PyppeteerBrowserManager, +) +from scrapypuppeteer.browser_managers.service_browser_manager import ( + ServiceBrowserManager, +) +from scrapypuppeteer.browser_managers.playwright_browser_manager import ( + PlaywrightBrowserManager, +) + +from scrapypuppeteer.browser_managers import BrowserManager + + +class PuppeteerServiceDownloaderMiddleware: + """ + This downloader middleware converts PuppeteerRequest instances to + Puppeteer service API requests and then converts its responses to + PuppeteerResponse instances. Additionally, it tracks all browser contexts + that spider uses and performs cleanup request to service right before + spider is closed. 
+ + Additionally, the middleware uses these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' + + Settings: + + PUPPETEER_SERVICE_URL (str) + Service URL, e.g. 'http://localhost:3000' + + PUPPETEER_INCLUDE_HEADERS (bool|list[str]) + Determines which request headers will be sent to remote site by puppeteer service. + Either True (all headers), False (no headers) or list of header names. + May be overridden per request. + By default, only cookies are sent. + + PUPPETEER_INCLUDE_META (bool) + Determines whether to send or not user's meta attached by user. + Default to False. + """ + + SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" + INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" + SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" + DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + + EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" + + service_logger = logging.getLogger(__name__) + + def __init__( + self, + crawler: Crawler, + service_url: str, + include_headers: Union[bool, List[str]], + include_meta: bool, + browser_manager: BrowserManager, + ): + self.service_base_url = service_url + self.include_headers = include_headers + self.include_meta = include_meta + self.crawler = crawler + self.used_contexts = defaultdict(set) + self.browser_manager = browser_manager + + @classmethod + def from_crawler(cls, crawler): + service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) + if cls.INCLUDE_HEADERS_SETTING in crawler.settings: + try: + include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) + except ValueError: + include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) + else: + include_headers = cls.DEFAULT_INCLUDE_HEADERS + include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) + + execution_method = crawler.settings.get( + cls.EXECUTION_METHOD_SETTING, "PUPPETEER" + ).lower() + + if execution_method == "pyppeteer": + browser_manager = PyppeteerBrowserManager() + elif execution_method == "puppeteer": + browser_manager = ServiceBrowserManager( + service_url, include_meta, include_headers, crawler + ) + elif execution_method == "playwright": + browser_manager = PlaywrightBrowserManager() + else: + raise NameError("Wrong EXECUTION_METHOD") + + middleware = cls( + crawler, service_url, include_headers, include_meta, browser_manager + ) + crawler.signals.connect( + middleware.browser_manager.close_used_contexts, signal=signals.spider_idle + ) + return middleware + + def process_request(self, request, spider): + return self.browser_manager.process_request(request) + + def process_response(self, request, response, spider): + return self.browser_manager.process_response(self, request, response, spider) From 8683cd4e0d1d5ab9e9d4239ee570110e30228052 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 22 Oct 2024 16:14:49 +0300 Subject: [PATCH 13/18] fix linter --- examples/spiders/dead_context.py | 6 +++--- scrapypuppeteer/middleware.py | 5 ++--- scrapypuppeteer/middlewares/__init__.py | 2 +- scrapypuppeteer/middlewares/recaptcha.py | 4 ++-- scrapypuppeteer/middlewares/restore.py | 5 ++--- scrapypuppeteer/middlewares/service.py | 9 ++++----- 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/examples/spiders/dead_context.py b/examples/spiders/dead_context.py index 93dbee5..3252a8e 100644 --- a/examples/spiders/dead_context.py +++ b/examples/spiders/dead_context.py @@ -1,10 +1,10 @@ -import scrapy - from 
asyncio import sleep +import scrapy +from twisted.python.failure import Failure + from scrapypuppeteer import PuppeteerRequest, PuppeteerResponse from scrapypuppeteer.actions import Click, GoTo -from twisted.python.failure import Failure class DeadContextSpider(scrapy.Spider): diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index 68d454a..bbfbe2f 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -3,12 +3,11 @@ import scrapy.exceptions from .middlewares import ( - PuppeteerServiceDownloaderMiddleware, - PuppeteerRecaptchaDownloaderMiddleware, PuppeteerContextRestoreDownloaderMiddleware, + PuppeteerRecaptchaDownloaderMiddleware, + PuppeteerServiceDownloaderMiddleware, ) - warnings.warn( "Import from `scrapypuppeteer.middleware` is deprecated. " "Use `scrapypuppeteer.middlewares` instead.", diff --git a/scrapypuppeteer/middlewares/__init__.py b/scrapypuppeteer/middlewares/__init__.py index fa2a319..80280d1 100644 --- a/scrapypuppeteer/middlewares/__init__.py +++ b/scrapypuppeteer/middlewares/__init__.py @@ -1,3 +1,3 @@ -from .service import PuppeteerServiceDownloaderMiddleware from .recaptcha import PuppeteerRecaptchaDownloaderMiddleware from .restore import PuppeteerContextRestoreDownloaderMiddleware +from .service import PuppeteerServiceDownloaderMiddleware diff --git a/scrapypuppeteer/middlewares/recaptcha.py b/scrapypuppeteer/middlewares/recaptcha.py index 4755d9e..da8bdd1 100644 --- a/scrapypuppeteer/middlewares/recaptcha.py +++ b/scrapypuppeteer/middlewares/recaptcha.py @@ -5,13 +5,13 @@ from scrapypuppeteer.actions import ( Click, + CustomJsAction, RecaptchaSolver, Screenshot, Scroll, - CustomJsAction, ) -from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse from scrapypuppeteer.request import PuppeteerRequest +from scrapypuppeteer.response import PuppeteerHtmlResponse, PuppeteerResponse recaptcha_logger = logging.getLogger(__name__) diff --git a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py index db6e1ad..5da4e10 100644 --- a/scrapypuppeteer/middlewares/restore.py +++ b/scrapypuppeteer/middlewares/restore.py @@ -1,14 +1,13 @@ import json import logging - -from typing import Union from http import HTTPStatus +from typing import Union from scrapy.crawler import Crawler from scrapy.exceptions import IgnoreRequest -from scrapypuppeteer.response import PuppeteerResponse from scrapypuppeteer.request import PuppeteerRequest +from scrapypuppeteer.response import PuppeteerResponse restore_logger = logging.getLogger(__name__) diff --git a/scrapypuppeteer/middlewares/service.py b/scrapypuppeteer/middlewares/service.py index 4ec41a8..b34d66e 100644 --- a/scrapypuppeteer/middlewares/service.py +++ b/scrapypuppeteer/middlewares/service.py @@ -5,17 +5,16 @@ from scrapy import signals from scrapy.crawler import Crawler +from scrapypuppeteer.browser_managers import BrowserManager +from scrapypuppeteer.browser_managers.playwright_browser_manager import ( + PlaywrightBrowserManager, +) from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( PyppeteerBrowserManager, ) from scrapypuppeteer.browser_managers.service_browser_manager import ( ServiceBrowserManager, ) -from scrapypuppeteer.browser_managers.playwright_browser_manager import ( - PlaywrightBrowserManager, -) - -from scrapypuppeteer.browser_managers import BrowserManager class PuppeteerServiceDownloaderMiddleware: From 36f223875b3dee7717bee75887c52bafad44e24f Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 31 Oct 
2024 12:31:46 +0300 Subject: [PATCH 14/18] updated restore middleware --- scrapypuppeteer/middlewares/restore.py | 114 ++++++++++++------------- setup.py | 2 +- 2 files changed, 54 insertions(+), 62 deletions(-) diff --git a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py index 5da4e10..79dea96 100644 --- a/scrapypuppeteer/middlewares/restore.py +++ b/scrapypuppeteer/middlewares/restore.py @@ -1,20 +1,20 @@ import json import logging from http import HTTPStatus -from typing import Union +from typing import Union, Dict from scrapy.crawler import Crawler -from scrapy.exceptions import IgnoreRequest -from scrapypuppeteer.request import PuppeteerRequest +from scrapypuppeteer.actions import Compose +from scrapypuppeteer.request import ActionRequest, PuppeteerRequest from scrapypuppeteer.response import PuppeteerResponse -restore_logger = logging.getLogger(__name__) - class PuppeteerContextRestoreDownloaderMiddleware: """ This middleware allows you to recover puppeteer context. + The middleware supposes that restored requests + would have the same effect as original requests. If you want to recover puppeteer context starting from the specified first request provide `recover_context` meta-key with `True` value. @@ -29,14 +29,15 @@ class PuppeteerContextRestoreDownloaderMiddleware: N_RETRY_RESTORING: int = 1 - number of tries to restore a context. """ + restore_logger = logging.getLogger(__name__) + N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" def __init__(self, restoring_length: int, n_retry_restoring: int): self.restoring_length = restoring_length self.n_retry_restoring = n_retry_restoring - self.context_requests = {} - self.context_length = {} + self.context_actions: Dict[str, Compose] = {} @classmethod def from_crawler(cls, crawler: Crawler): @@ -47,7 +48,7 @@ def from_crawler(cls, crawler: Crawler): ) elif restoring_length < 1: raise ValueError( - f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1" + f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1, got {restoring_length}" ) n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) @@ -57,13 +58,12 @@ def from_crawler(cls, crawler: Crawler): ) elif n_retry_restoring < 1: raise ValueError( - f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1" + f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1, got {n_retry_restoring}" ) return cls(restoring_length, n_retry_restoring) - @staticmethod - def process_request(request, spider): + def process_request(self, request, spider): if not isinstance(request, PuppeteerRequest): return None @@ -71,12 +71,13 @@ def process_request(request, spider): return None if request.context_id or request.page_id: - raise IgnoreRequest( - f"Request {request} is not in the beginning of the request-response sequence" + self.restore_logger.warning( + f"Request {request} is not in the beginning of the request-response sequence." + "Cannot 'restore' this sequence, skipping." 
) + return None request.meta["__request_binding"] = True - request.dont_filter = True return None def process_response(self, request, response, spider): @@ -89,10 +90,10 @@ def process_response(self, request, response, spider): if isinstance(response, PuppeteerResponse): if request_binding: - self._bind_context(request, response) - if response.context_id in self.context_length: - # Update number of actions in context - self.context_length[response.context_id] += 1 + self.context_actions[response.context_id] = Compose(request.action) + if response.context_id in self.context_actions: + # Update actions in context + self._update_context_actions(request, response) elif ( puppeteer_request is not None and response.status == HTTPStatus.UNPROCESSABLE_ENTITY @@ -100,57 +101,48 @@ def process_response(self, request, response, spider): # One PuppeteerRequest has failed with 422 error if request_binding: # Could not get context, retry - if request.meta.get("__restore_count", 0) < self.n_retry_restoring: - request.meta["__restore_count"] += 1 - return request + if request.meta.get("__request_binding_count", 0) < self.n_retry_restoring: + new_request = request.copy() + new_request.meta["__request_binding_count"] += 1 + return new_request else: - return self._restore_context(response) + return self._restore_context(puppeteer_request, response) return response - def _bind_context(self, request, response): - if request.meta.get("__context_id", None) is not None: - # Need to update context_id - self.__delete_context(request.meta["__context_id"], None) - restoring_request = request.copy() - restoring_request.meta["__restore_count"] = restoring_request.meta.get( - "__restore_count", 0 - ) - restoring_request.meta["__context_id"] = response.context_id - self.context_requests[response.context_id] = restoring_request - self.context_length[response.context_id] = 0 + def _update_context_actions(self, request: ActionRequest, response: PuppeteerResponse): + context_id = response.context_id + context_actions = self.context_actions[context_id] + + if len(context_actions.actions) > self.restoring_length: + self.__delete_context( + context_id, + f"Too many actions in context ({context_id}). Deleting it.", + ) + else: + self.context_actions[response.context_id] = Compose( + context_actions, + request.action, + ) - def _restore_context(self, response): + def _restore_context(self, puppeteer_request: PuppeteerRequest, response): context_id = json.loads(response.text).get("contextId", None) - if context_id in self.context_requests: - restoring_request = self.context_requests[context_id] - - if self.context_length[context_id] >= self.restoring_length + 1: - # Too many actions in context - self.__delete_context( - context_id, - f"Too many actions in context ({restoring_request}). Deleting it.", - ) - elif restoring_request.meta["__restore_count"] >= self.n_retry_restoring: - # Too many retries - self.__delete_context( - context_id, - f"Too many retries in context ({restoring_request}). 
Deleting it.", - ) - else: - # Restoring - restoring_request.meta["__restore_count"] += 1 - restore_logger.log( - level=logging.DEBUG, - msg=f"Restoring the request {restoring_request}", - ) - self.context_length[context_id] = 1 - return restoring_request + if context_id in self.context_actions: + # Restoring + restoring_request = puppeteer_request.copy() + restoring_request.meta["__restore_count"] += 1 + restoring_request.action = self.context_actions.pop(context_id) + self.restore_logger.log( + level=logging.DEBUG, + msg=f"Restoring the context with context_id {context_id}", + ) + return restoring_request + + self.restore_logger.warning(f"Context_id {context_id} not in context_actions.") return response def __delete_context(self, context_id: str, reason: Union[str, None]): - del self.context_length[context_id] - del self.context_requests[context_id] + del self.context_actions[context_id] if reason is not None: - restore_logger.log(level=logging.INFO, msg=reason) + self.restore_logger.log(level=logging.INFO, msg=reason) diff --git a/setup.py b/setup.py index b9b7750..50b3597 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ def read_long_description(file_path): setup( name="scrapy-puppeteer-client", - version="0.3.8", + version="0.3.9", description="A library to use Puppeteer-managed browser in Scrapy spiders", long_description=read_long_description("README.md"), long_description_content_type="text/markdown", From 1edf840e1d560ea25ec193ab6f29eeb25d03fbf3 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 31 Oct 2024 13:07:15 +0300 Subject: [PATCH 15/18] fixed restore middleware --- scrapypuppeteer/middlewares/restore.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py index 79dea96..13cc68b 100644 --- a/scrapypuppeteer/middlewares/restore.py +++ b/scrapypuppeteer/middlewares/restore.py @@ -91,12 +91,12 @@ def process_response(self, request, response, spider): if isinstance(response, PuppeteerResponse): if request_binding: self.context_actions[response.context_id] = Compose(request.action) - if response.context_id in self.context_actions: + elif response.context_id in self.context_actions: # Update actions in context self._update_context_actions(request, response) elif ( - puppeteer_request is not None - and response.status == HTTPStatus.UNPROCESSABLE_ENTITY + puppeteer_request is not None + and response.status == HTTPStatus.UNPROCESSABLE_ENTITY ): # One PuppeteerRequest has failed with 422 error if request_binding: @@ -129,9 +129,12 @@ def _restore_context(self, puppeteer_request: PuppeteerRequest, response): if context_id in self.context_actions: # Restoring - restoring_request = puppeteer_request.copy() - restoring_request.meta["__restore_count"] += 1 - restoring_request.action = self.context_actions.pop(context_id) + restoring_request = puppeteer_request.replace( + action=Compose(self.context_actions.pop(context_id), puppeteer_request.action), + context_id=None, + page_id=None, + ) + restoring_request.meta["__request_binding"] = True self.restore_logger.log( level=logging.DEBUG, msg=f"Restoring the context with context_id {context_id}", From c80529262db075f93722dd09f39f27f42984bc9d Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 31 Oct 2024 13:10:25 +0300 Subject: [PATCH 16/18] docs --- README.md | 5 ++++- scrapypuppeteer/middlewares/restore.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ef5d74d..48665ee 100644 --- 
a/README.md +++ b/README.md @@ -179,7 +179,7 @@ In this case RecaptchaMiddleware will just skip the request. Sometimes you may receive responses with status 422 (Unprocessable Entity). This means the scrapy-puppeteer-services struggled to find provided context or page in its memory. -In such situations you can use this middleware to restore such contexts. +In such situations you can use this middleware to restore these contexts. Enabling the middleware: ```Python @@ -211,6 +211,9 @@ yield PuppeteerRequest( ... ``` +Also, you can see `dead_context` spider and try to enable `PuppeteerContextRestoreDownloaderMiddleware` in its `custom_settings` +to see the working middleware. + ## TODO - [x] skeleton that could handle goto, click, scroll, and actions diff --git a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py index 13cc68b..ba841c3 100644 --- a/scrapypuppeteer/middlewares/restore.py +++ b/scrapypuppeteer/middlewares/restore.py @@ -21,7 +21,7 @@ class PuppeteerContextRestoreDownloaderMiddleware: The middleware uses additionally these meta-keys, do not use them, because their changing could possibly (almost probably) break determined behaviour: - `__request_binding`, `__restore_count`, `__context_id`. + `__request_binding` Settings: From 27794a0bd0172fa6afd1d6b6801e27e9baf0a420 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 31 Oct 2024 13:11:41 +0300 Subject: [PATCH 17/18] formatter --- scrapypuppeteer/middlewares/restore.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py index ba841c3..9efc5c4 100644 --- a/scrapypuppeteer/middlewares/restore.py +++ b/scrapypuppeteer/middlewares/restore.py @@ -95,13 +95,16 @@ def process_response(self, request, response, spider): # Update actions in context self._update_context_actions(request, response) elif ( - puppeteer_request is not None - and response.status == HTTPStatus.UNPROCESSABLE_ENTITY + puppeteer_request is not None + and response.status == HTTPStatus.UNPROCESSABLE_ENTITY ): # One PuppeteerRequest has failed with 422 error if request_binding: # Could not get context, retry - if request.meta.get("__request_binding_count", 0) < self.n_retry_restoring: + if ( + request.meta.get("__request_binding_count", 0) + < self.n_retry_restoring + ): new_request = request.copy() new_request.meta["__request_binding_count"] += 1 return new_request @@ -109,7 +112,9 @@ def process_response(self, request, response, spider): return self._restore_context(puppeteer_request, response) return response - def _update_context_actions(self, request: ActionRequest, response: PuppeteerResponse): + def _update_context_actions( + self, request: ActionRequest, response: PuppeteerResponse + ): context_id = response.context_id context_actions = self.context_actions[context_id] @@ -130,7 +135,9 @@ def _restore_context(self, puppeteer_request: PuppeteerRequest, response): if context_id in self.context_actions: # Restoring restoring_request = puppeteer_request.replace( - action=Compose(self.context_actions.pop(context_id), puppeteer_request.action), + action=Compose( + self.context_actions.pop(context_id), puppeteer_request.action + ), context_id=None, page_id=None, ) From b61a245be0a2096a901cae2ec91f6e85688aea3f Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 31 Oct 2024 13:11:57 +0300 Subject: [PATCH 18/18] linter --- scrapypuppeteer/middlewares/restore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py index 9efc5c4..51f5df5 100644 --- a/scrapypuppeteer/middlewares/restore.py +++ b/scrapypuppeteer/middlewares/restore.py @@ -1,7 +1,7 @@ import json import logging from http import HTTPStatus -from typing import Union, Dict +from typing import Dict, Union from scrapy.crawler import Crawler
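
For reference, a minimal spider exercising the restore middleware introduced in these patches might look like the sketch below. It is an illustration only: the middleware order values, the target URL, the CSS selector, and the spider itself are assumptions, not taken from the patches; the setting names (`RESTORING_LENGTH`, `N_RETRY_RESTORING`, `PUPPETEER_SERVICE_URL`), the `recover_context` meta-key, and the `scrapypuppeteer.middlewares` import path do come from the diffs above.

```python
# A minimal sketch of a spider that opts into context restoring.
# Assumptions (not taken from the patches): the middleware order values,
# the example URL, the selector, and the parse logic are illustrative only.
import scrapy

from scrapypuppeteer import PuppeteerRequest, PuppeteerResponse
from scrapypuppeteer.actions import Click, GoTo


class RestoreDemoSpider(scrapy.Spider):
    name = "restore_demo"

    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            # Order values are placeholders; adjust to your project.
            "scrapypuppeteer.middlewares.PuppeteerContextRestoreDownloaderMiddleware": 1041,
            "scrapypuppeteer.middlewares.PuppeteerServiceDownloaderMiddleware": 1042,
        },
        "PUPPETEER_SERVICE_URL": "http://localhost:3000",
        # Cap on how many actions are recorded for replay per context.
        "RESTORING_LENGTH": 2,
        # How many times the initial context-binding request is retried on 422.
        "N_RETRY_RESTORING": 3,
    }

    def start_requests(self):
        # `recover_context` must be set on the first request of a sequence,
        # i.e. one that carries no context_id or page_id yet.
        yield PuppeteerRequest(
            GoTo("https://example.com"),
            callback=self.parse,
            meta={"recover_context": True},
        )

    def parse(self, response: PuppeteerResponse):
        # Follow-up actions reuse the same browser context; if the service
        # loses that context and answers 422, the middleware replays the
        # recorded actions in a fresh context.
        yield response.follow(
            Click("a.next"),
            callback=self.parse,
        )
```

One design point worth noting from the later patches: the middleware no longer stores the first request of a sequence, it stores a `Compose` of the actions performed in each context. On a 422 response it builds a restoring request with `context_id` and `page_id` cleared and an action composed from the recorded sequence plus the failed action, so the whole chain is replayed in a single new context rather than re-crawled request by request.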