Use new .list_iter() method of sh client to avoid issues #8

Merged · 1 commit · Jun 17, 2020
setup.py (1 addition, 2 deletions)

@@ -14,12 +14,11 @@
     install_requires = (
         'pyyaml>=3.12',
         'retrying>=1.3.3',
-        'scrapinghub>=2.0.3',
+        'scrapinghub[msgpack]>=2.3.1',
         'jinja2>=2.7.3',
         'sqlitedict==1.6.0',
         's3fs==0.2.0',
         'boto3>=1.9.92',
-        'msgpack-python',
     ),
     scripts = [],
     classifiers = [
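The new pin replaces the standalone msgpack-python requirement with scrapinghub's own [msgpack] extra, so the client and its msgpack bindings are resolved together by a single requirement. A minimal sketch of a sanity check that the extra resolved (the version numbers in the comments are examples, not values from this PR):

    # Minimal sketch: confirm the [msgpack] extra pulled in the bindings.
    # Exact versions shown are examples, not pins from this PR.
    import msgpack
    import pkg_resources

    print(pkg_resources.get_distribution('scrapinghub').version)  # expect >= 2.3.1
    print(msgpack.version)  # version tuple, e.g. (0, 6, 2)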
shub_workflow/deliver.py (9 additions, 1 deletion)

@@ -205,6 +205,8 @@ class S3DeliverScript(BaseScript):
     s3_success_file = False
     s3_bucket_name = None
 
+    default_sh_chunk_size = 1_000
+
     def __init__(self):
         super().__init__()
@@ -236,6 +238,10 @@ def add_argparser_options(self):
         self.argparser.add_argument('--test-mode', action='store_true',
                                     help='Run in test mode (performs all processes, but doesn\'t\
                                     upload files nor tag jobs)')
+        self.argparser.add_argument('--sh-chunk-size', type=int, default=self.default_sh_chunk_size, help=(
+            'Chunk/page size for downloading items from Scrapy Cloud. For tweaking memory consumption and speed.'
+            ' Note that the performance will depend on the sizes of individual items in the cloud.'
+        ))

     def gen_keyprefix(self, scrapername, job, item):
         formatted_datetime = self.start_datetime.strftime('%Y-%m-%dT%H:%M:%S')
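Since the default comes from the default_sh_chunk_size class attribute, scripts built on S3DeliverScript can tune it without passing --sh-chunk-size on every run. A minimal sketch, assuming a hypothetical subclass:

    # Minimal sketch: a hypothetical subclass lowering the default chunk size.
    # The --sh-chunk-size flag still overrides this value at run time.
    from shub_workflow.deliver import S3DeliverScript

    class MyDeliverScript(S3DeliverScript):
        default_sh_chunk_size = 500  # smaller chunks, lower memory footprint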
@@ -244,7 +250,9 @@ def gen_keyprefix(self, scrapername, job, item):
     def _process_job_items(self, scrapername, spider_job):
         first_keyprefix = None
         job_item_count = 0
-        for item in spider_job.items.iter():
+        chunks = spider_job.items.list_iter(chunksize=self.args.sh_chunk_size)
+        items_iter = (item for chunk in chunks for item in chunk)
+        for item in items_iter:
             seen = False
             for field in self.dupes_filter.keys():
                 if field in item:
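Unlike items.iter(), which streams one item at a time, list_iter() yields lists of at most chunksize items per request; the generator expression flattens those chunks back into a per-item stream, so the body of _process_job_items is otherwise unchanged. A rough standalone sketch of the same pattern (the API key and job key are placeholders):

    # Rough standalone sketch of the chunked iteration pattern.
    # 'APIKEY' and '123/1/1' are placeholders, not values from this PR.
    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient('APIKEY')
    job = client.get_job('123/1/1')

    chunks = job.items.list_iter(chunksize=1000)  # lists of up to 1000 items
    for item in (it for chunk in chunks for it in chunk):
        print(item)  # stand-in for per-item processing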