Merge branch 'main' into fix-optimum-quanto-version

singnet · Jan 6, 2025 · e31ca25 · e31ca25
2 parents d163a00 + 05f2c07
commit e31ca25
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 4 deletions.
diff --git a/multigen/worker.py b/multigen/worker.py
@@ -64,9 +64,15 @@ def _get_pipeline(self, pipe_class, model_id, model_type, cnet=None, quantize_dt
                 # use quantisation by default for now
                 cls = pipe_class._classflux
                 if device.type == 'cuda':
-                    quantize_dtype = qfloat8
-                    # offload_device = device.index
-                    # device = torch.device('cpu')
+                    mb = torch.cuda.get_device_properties(device.index).total_memory / 1024 / 1024
+                    # quantize if the device has more than 23000 MiB (~22.5 GiB) of total memory;
+                    # otherwise fall back to cpu offload
+                    if 23000 < mb:
+                        self.logger.debug(f"set quantisation for the pipe on cuda:{device.index} has {mb}Mb")
+                        quantize_dtype = qfloat8
+                    else:
+                        offload_device = device.index
+                        device = torch.device('cpu')
             else:
                 cls = pipe_class._class
             pipeline = self._loader.load_pipeline(cls, model_id, torch_dtype=torch.bfloat16, 

diff --git a/tests/test_worker_flux.py b/tests/test_worker_flux.py
@@ -68,7 +68,7 @@ def on_new_image(*args, **kwargs):
             nonlocal c
             c += 1
 
-        num_runs = 25
+        num_runs = 15
         for i in range(num_runs):
             if len(sessions) - 1 < i:
                 i %= len(sessions)