Skip to content

Commit

Permalink
using tempfile and memmap to work on arrays in memory, first stores ea…
Browse files Browse the repository at this point in the history
…ch array individually and then combines them
  • Loading branch information
LTDakin committed Mar 21, 2024
1 parent ab49450 commit 4846322
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 42 deletions.
90 changes: 49 additions & 41 deletions datalab/datalab_session/data_operations/median.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from io import BytesIO
import logging
import os
import tempfile

import numpy as np
from astropy.io import fits
Expand Down Expand Up @@ -49,48 +51,54 @@ def operate(self):

log.info(f'Executing median operation on {file_count} files')

# fetch fits for all input data
image_data_list = []

for index, file_info in enumerate(input_files):
basename = file_info.get('basename', 'No basename found')

archive_record = get_archive_from_basename(basename)
try:
fits_url = archive_record[0].get('url', 'No URL found')
except IndexError:
continue

with fits.open(fits_url, use_fsspec=True) as hdu_list:
data = hdu_list['SCI'].data
image_data_list.append(data)
self.set_percent_completion((index) / file_count)

# Crop fits image data to be the same shape then stack
min_shape = min(arr.shape for arr in image_data_list)
cropped_data_list = [arr[:min_shape[0], :min_shape[1]] for arr in image_data_list]
stacked_data = np.stack(cropped_data_list, axis=2)

# Calculate a Median along the z axis
median = np.median(stacked_data, axis=2)

# Create a new Fits File
cache_key = self.generate_cache_key()
hdr = fits.Header([('KEY', cache_key)])
primary_hdu = fits.PrimaryHDU(header=hdr)
image_hdu = fits.ImageHDU(median)
hdu_list = fits.HDUList([primary_hdu, image_hdu])

fits_buffer = BytesIO()
hdu_list.writeto(fits_buffer)
fits_buffer.seek(0)

# Write the HDU List to the output FITS file in bitbucket
response = store_fits_output(cache_key, fits_buffer)
with tempfile.TemporaryDirectory() as temp_dir:
memmap_paths = []

for index, file_info in enumerate(input_files):
basename = file_info.get('basename', 'No basename found')
archive_record = get_archive_from_basename(basename)

try:
fits_url = archive_record[0].get('url', 'No URL found')
except IndexError:
continue

with fits.open(fits_url, use_fsspec=True) as hdu_list:
data = hdu_list['SCI'].data
memmap_path = os.path.join(temp_dir, f'memmap_{index}.dat')
memmap_array = np.memmap(memmap_path, dtype=data.dtype, mode='w+', shape=data.shape)
memmap_array[:] = data[:]
memmap_paths.append(memmap_path)

self.set_percent_completion(index / file_count)

image_data_list = [
np.memmap(path, dtype=np.float32, mode='r', shape=memmap_array.shape)
for path in memmap_paths
]

# Crop fits image data to be the same shape then stack
min_shape = min(arr.shape for arr in image_data_list)
cropped_data_list = [arr[:min_shape[0], :min_shape[1]] for arr in image_data_list]
stacked_data = np.stack(cropped_data_list, axis=2)

# Calculate a Median along the z axis
median = np.median(stacked_data, axis=2)

cache_key = self.generate_cache_key()
header = fits.Header([('KEY', cache_key)])
primary_hdu = fits.PrimaryHDU(header=header)
image_hdu = fits.ImageHDU(median)
hdu_list = fits.HDUList([primary_hdu, image_hdu])

fits_buffer = BytesIO()
hdu_list.writeto(fits_buffer)
fits_buffer.seek(0)

# Write the HDU List to the output FITS file in bitbucket
response = store_fits_output(cache_key, fits_buffer)

# TODO: No output yet, need to build a thumbnail service
output = {
'output_files': []
}
output = {'output_files': []}
self.set_percent_completion(file_count / file_count)
self.set_output(output)
2 changes: 1 addition & 1 deletion datalab/datalab_session/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def store_fits_output(item_key, fits_buffer):

def get_archive_from_basename(basename: str) -> dict:
"""
Querys and returns an archive file from the Archive
Queries and returns an archive file from the Archive
Keyword Arguments:
basename -- name to query
Expand Down

0 comments on commit 4846322

Please sign in to comment.