From 86182724c0937c6567dc155aaf23b07f957136f2 Mon Sep 17 00:00:00 2001 From: Ben Brown Date: Fri, 1 Mar 2024 09:56:24 +0000 Subject: [PATCH 1/2] Stream response as to not read the entire contents into memory --- htmlproofer/plugin.py | 7 ++++++- tests/unit/test_plugin.py | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/htmlproofer/plugin.py b/htmlproofer/plugin.py index 8550389..0dfe40a 100644 --- a/htmlproofer/plugin.py +++ b/htmlproofer/plugin.py @@ -146,7 +146,12 @@ def get_external_url(self, url, scheme, src_path): @lru_cache(maxsize=1000) def resolve_web_scheme(self, url: str) -> int: try: - response = self._session.get(url, timeout=URL_TIMEOUT) + response = self._session.get(url, timeout=URL_TIMEOUT, stream=True) + + # Download the entire contents as to not break previous behaviour. + for _ in response.iter_content(chunk_size=1024): + pass + return response.status_code except requests.exceptions.Timeout: return 504 diff --git a/tests/unit/test_plugin.py b/tests/unit/test_plugin.py index 2645779..8f0cd11 100644 --- a/tests/unit/test_plugin.py +++ b/tests/unit/test_plugin.py @@ -78,8 +78,10 @@ def test_on_post_page( }) # Always raise a 500 error - mock_requests.side_effect = [Mock(spec=Response, status_code=500)] link_to_500 = '' + iter_content = Mock() + iter_content.side_effect = link_to_500 + mock_requests.side_effect = [Mock(spec=Response, status_code=500, iter_content=iter_content)] plugin.files = empty_files page = Mock( From 08f01526824cfa01d3e283e90e85f9fa1b523792 Mon Sep 17 00:00:00 2001 From: Ben Brown Date: Fri, 1 Mar 2024 12:17:20 +0000 Subject: [PATCH 2/2] Add config option to skip downloading the body of a response Closes #76. 
--- README.md | 11 +++++++++++ htmlproofer/plugin.py | 8 +++++--- tests/integration/mkdocs.yml | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3ee32b6..c2c2d6d 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,17 @@ plugins: validate_rendered_template: True ``` +### `skip_downloads` + +Optionally skip downloading the content of a remote URL fetched via GET request. This can +considerably reduce the time taken to validate URLs. + +```yaml +plugins: + - htmlproofer: + skip_downloads: True +``` + ## Compatibility with `attr_list` extension If you need to manually specify anchors make use of the `attr_list` [extension](https://python-markdown.github.io/extensions/attr_list) in the markdown. diff --git a/htmlproofer/plugin.py b/htmlproofer/plugin.py index 0dfe40a..d1ea3f5 100644 --- a/htmlproofer/plugin.py +++ b/htmlproofer/plugin.py @@ -65,6 +65,7 @@ class HtmlProoferPlugin(BasePlugin): ('raise_error', config_options.Type(bool, default=False)), ('raise_error_after_finish', config_options.Type(bool, default=False)), ('raise_error_excludes', config_options.Type(dict, default={})), + ('skip_downloads', config_options.Type(bool, default=False)), ('validate_external_urls', config_options.Type(bool, default=True)), ('validate_rendered_template', config_options.Type(bool, default=False)), ('ignore_urls', config_options.Type(list, default=[])), @@ -148,9 +149,10 @@ def resolve_web_scheme(self, url: str) -> int: try: response = self._session.get(url, timeout=URL_TIMEOUT, stream=True) - # Download the entire contents as to not break previous behaviour. - for _ in response.iter_content(chunk_size=1024): - pass + if self.config['skip_downloads'] is False: + # Download the entire contents as to not break previous behaviour. 
+ for _ in response.iter_content(chunk_size=1024): + pass return response.status_code except requests.exceptions.Timeout: diff --git a/tests/integration/mkdocs.yml b/tests/integration/mkdocs.yml index 2ca79d4..abe26a5 100644 --- a/tests/integration/mkdocs.yml +++ b/tests/integration/mkdocs.yml @@ -16,3 +16,4 @@ plugins: 'page2.html#BAD_ANCHOR', '../../../tests', ] + skip_downloads: True