Use new .list_iter() method of sh client to avoid issues #8

Merged · 1 commit · Jun 17, 2020
setup.py (1 addition, 2 deletions)

@@ -14,12 +14,11 @@
     install_requires = (
         'pyyaml>=3.12',
         'retrying>=1.3.3',
-        'scrapinghub>=2.0.3',
+        'scrapinghub[msgpack]>=2.3.1',
         'jinja2>=2.7.3',
         'sqlitedict==1.6.0',
         's3fs==0.2.0',
         'boto3>=1.9.92',
-        'msgpack-python',
     ),
     scripts = [],
     classifiers = [
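The new pin replaces the standalone msgpack-python requirement with scrapinghub's own [msgpack] extra, so the client and its msgpack bindings are resolved together by a single requirement. A minimal sketch of a sanity check that the extra resolved (the version numbers in the comments are examples, not values from this PR):

    # Minimal sketch: confirm the [msgpack] extra pulled in the bindings.
    # Exact versions shown are examples, not pins from this PR.
    import msgpack
    import pkg_resources

    print(pkg_resources.get_distribution('scrapinghub').version)  # expect >= 2.3.1
    print(msgpack.version)  # version tuple, e.g. (0, 6, 2)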
shub_workflow/deliver.py (9 additions, 1 deletion)

@@ -205,6 +205,8 @@ class S3DeliverScript(BaseScript):
     s3_success_file = False
     s3_bucket_name = None
 
+    default_sh_chunk_size = 1_000
+
     def __init__(self):
         super().__init__()
@@ -236,6 +238,10 @@ def add_argparser_options(self):
         self.argparser.add_argument('--test-mode', action='store_true',
                                     help='Run in test mode (performs all processes, but doesn\'t\
                                     upload files nor tag jobs)')
+        self.argparser.add_argument('--sh-chunk-size', type=int, default=self.default_sh_chunk_size, help=(
+            'Chunk/page size for downloading items from Scrapy Cloud. For tweaking memory consumption and speed.'
+            ' Note that the performance will depend on the sizes of individual items in the cloud.'
+        ))

     def gen_keyprefix(self, scrapername, job, item):
         formatted_datetime = self.start_datetime.strftime('%Y-%m-%dT%H:%M:%S')
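Since the default comes from the default_sh_chunk_size class attribute, scripts built on S3DeliverScript can tune it without passing --sh-chunk-size on every run. A minimal sketch, assuming a hypothetical subclass:

    # Minimal sketch: a hypothetical subclass lowering the default chunk size.
    # The --sh-chunk-size flag still overrides this value at run time.
    from shub_workflow.deliver import S3DeliverScript

    class MyDeliverScript(S3DeliverScript):
        default_sh_chunk_size = 500  # smaller chunks, lower memory footprint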
@@ -244,7 +250,9 @@ def gen_keyprefix(self, scrapername, job, item):
     def _process_job_items(self, scrapername, spider_job):
         first_keyprefix = None
         job_item_count = 0
-        for item in spider_job.items.iter():
+        chunks = spider_job.items.list_iter(chunksize=self.args.sh_chunk_size)
+        items_iter = (item for chunk in chunks for item in chunk)
+        for item in items_iter:
             seen = False
             for field in self.dupes_filter.keys():
                 if field in item:
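Unlike items.iter(), which streams one item at a time, list_iter() yields lists of at most chunksize items per request; the generator expression flattens those chunks back into a per-item stream, so the body of _process_job_items is otherwise unchanged. A rough standalone sketch of the same pattern (the API key and job key are placeholders):

    # Rough standalone sketch of the chunked iteration pattern.
    # 'APIKEY' and '123/1/1' are placeholders, not values from this PR.
    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient('APIKEY')
    job = client.get_job('123/1/1')

    chunks = job.items.list_iter(chunksize=1000)  # lists of up to 1000 items
    for item in (it for chunk in chunks for it in chunk):
        print(item)  # stand-in for per-item processing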