From b730fe3c73a198d6474cfbf249b1a2830e98992b Mon Sep 17 00:00:00 2001 From: Raphael Merx Date: Sun, 30 Apr 2023 12:36:42 +0800 Subject: [PATCH 1/3] Save as .html instead of .snapshot --- README.md | 40 ++++++++++++------------ wayback_machine_scraper/__main__.py | 4 +-- wayback_machine_scraper/mirror_spider.py | 4 +-- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 9bfada9..cf3aba0 100644 --- a/README.md +++ b/README.md @@ -64,8 +64,8 @@ optional arguments: will be automatically adjusted to match this target.Use values less than 1 to be polite and higher values to scrape more quickly. (default: 10.0) - -u, --unix Save snapshots as `UNIX_TIMESTAMP.snapshot` instead of - the default `YYYYmmddHHMMSS.snapshot`. (default: + -u, --unix Save snapshots as `UNIX_TIMESTAMP.html` instead of + the default `YYYYmmddHHMMSS.html`. (default: False) -v, --verbose Turn on debug logging. (default: False) ``` @@ -92,15 +92,15 @@ This produces a file structure of ``` website/ └── news.ycombinator.com - ├── 20070221033032.snapshot - ├── 20070226001637.snapshot - ├── 20070405032412.snapshot - ├── 20070405175109.snapshot - ├── 20070406195336.snapshot - ├── 20070601184317.snapshot - ├── 20070629033202.snapshot - ├── 20070630222527.snapshot - ├── 20070630222818.snapshot + ├── 20070221033032.html + ├── 20070226001637.html + ├── 20070405032412.html + ├── 20070405175109.html + ├── 20070406195336.html + ├── 20070601184317.html + ├── 20070629033202.html + ├── 20070630222527.html + ├── 20070630222818.html └── etc. 
``` @@ -120,11 +120,11 @@ which produces website/ └── news.ycombinator.com └── item?id=13857086 - ├── 20170313225853.snapshot - ├── 20170313231755.snapshot - ├── 20170314043150.snapshot - ├── 20170314165633.snapshot - └── 20170320205604.snapshot + ├── 20170313225853.html + ├── 20170313231755.html + ├── 20170314043150.html + ├── 20170314165633.html + └── 20170320205604.html ``` ### A Full Site Crawl at One Point In Time @@ -142,13 +142,13 @@ produces a file structure of ``` website └── news.ycombinator.com - ├── 20080621143814.snapshot + ├── 20080621143814.html ├── item?id=221868 - │   └── 20080622151531.snapshot + │   └── 20080622151531.html ├── item?id=222157 - │   └── 20080622151822.snapshot + │   └── 20080622151822.html ├── item?id=222341 - │   └── 20080620221102.snapshot + │   └── 20080620221102.html └── etc. ``` diff --git a/wayback_machine_scraper/__main__.py b/wayback_machine_scraper/__main__.py index 3984dbb..4c2dfb8 100644 --- a/wayback_machine_scraper/__main__.py +++ b/wayback_machine_scraper/__main__.py @@ -73,8 +73,8 @@ def parse_args(): 'Use values less than 1 to be polite and higher values to scrape more quickly.' )) parser.add_argument('-u', '--unix', action='store_true', help=( - 'Save snapshots as `UNIX_TIMESTAMP.snapshot` instead of ' - 'the default `YYYYmmddHHMMSS.snapshot`.' + 'Save snapshots as `UNIX_TIMESTAMP.html` instead of ' + 'the default `YYYYmmddHHMMSS.html`.' )) parser.add_argument('-v', '--verbose', action='store_true', help=( 'Turn on debug logging.' 
diff --git a/wayback_machine_scraper/mirror_spider.py b/wayback_machine_scraper/mirror_spider.py index 89624fa..e01771e 100644 --- a/wayback_machine_scraper/mirror_spider.py +++ b/wayback_machine_scraper/mirror_spider.py @@ -60,9 +60,9 @@ def save_page(self, response): # construct the output filename time = response.meta['wayback_machine_time'] if self.unix: - filename = '{0}.snapshot'.format(time.timestamp()) + filename = '{0}.html'.format(time.timestamp()) else: - filename = '{0}.snapshot'.format(time.strftime(WaybackMachineMiddleware.timestamp_format)) + filename = '{0}.html'.format(time.strftime(WaybackMachineMiddleware.timestamp_format)) full_path = os.path.join(parent_directory, filename) # write out the file From da4fc2a13d7c7edce57703aac1c79d11b70f9819 Mon Sep 17 00:00:00 2001 From: Raphael Merx Date: Sun, 30 Apr 2023 12:39:20 +0800 Subject: [PATCH 2/3] Bump version to 1.1.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e30895b..b92283d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='wayback-machine-scraper', - version='1.0.7', + version='1.1.0', author='Evan Sangaline', author_email='evan@intoli.com', description=description, From 0fa0c93bb72ce9f9a1881c97fcca6efd7e6783d4 Mon Sep 17 00:00:00 2001 From: Raphael Merx Date: Mon, 1 May 2023 13:29:44 +0800 Subject: [PATCH 3/3] Fix help text for -o argument --- wayback_machine_scraper/__main__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/wayback_machine_scraper/__main__.py b/wayback_machine_scraper/__main__.py index 4c2dfb8..b317dab 100644 --- a/wayback_machine_scraper/__main__.py +++ b/wayback_machine_scraper/__main__.py @@ -50,8 +50,7 @@ def parse_args(): 'Can also be a full URL to specify starting points for the crawler.' )) parser.add_argument('-o', '--output', metavar='DIRECTORY', default='website', help=( - 'Specify the domain(s) to scrape. 
' - 'Can also be a full URL to specify starting points for the crawler.' + 'Directory to save scraped files to.' )) parser.add_argument('-f', '--from', metavar='TIMESTAMP', default='10000101', help=( 'The timestamp for the beginning of the range to scrape. '