lora bug fix; add hyper-flux-8steps-lora & hyper-flux-16steps-lora
matatonic committed Sep 14, 2024
1 parent 3758cc5 commit daa7483
Showing 6 changed files with 93 additions and 6 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -174,7 +174,7 @@ Low VRAM options (<4GB VRAM, 34GB RAM, `+enable_sequential_cpu_offload`, float16

There are `-compile` variants of many models as well. Be advised that the first couple of images from a compiled model will be very slow to generate: the server must load, and perhaps quantize and compile, the model, and generation is then dynamically optimized over the next couple of runs, so the first image may take 10 minutes or more to prepare. Most models can generate dozens of images in that time, so only use compiled models if you know what you're doing.

And more, including `int8` quants, check out the `config/lib` folder for more examples, including lora options.
And there's more, including `int8` quants: check out the `config/lib` folder for further examples, including lora options such as ByteDance's `hyper-flux-8steps-lora`.

> Timings are casually measured at 1024x1024 standard on an Nvidia A100 and may vary wildly on your system.
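
As a quick illustration, a new lora generator such as `hyper-flux-8steps-lora` can be requested like any other model through the OpenAI-compatible images endpoint. The sketch below is not from the repository: the base URL, port, and API key are placeholder assumptions to adjust for your own deployment.

```python
# Hypothetical client call -- base_url, port, and api_key are placeholders,
# not values defined by this repository.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5005/v1", api_key="sk-anything")

response = client.images.generate(
    model="hyper-flux-8steps-lora",  # generator name registered in config.default.json
    prompt="a photograph of an astronaut riding a horse",
    size="1024x1024",
    response_format="b64_json",
)
print(f"got {len(response.data)} image(s), created={response.created}")
```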
@@ -185,7 +185,9 @@ And more, including `int8` quants, check out the `config/lib` folder for more ex
## Performance

Performance plots for A100 (80GB) and 4090 (24GB), batch size = 1. Click Details to expand.

<details>
<summary> Performance details for A100 & 4090</summary>

![Processing time on A100](processing_time_A100.png)

8 changes: 8 additions & 0 deletions config.default.json
@@ -63,6 +63,14 @@
"generator": "lib/sayakpaul-flux.1-merged-low.json"
},

"hyper-flux-8steps-lora": {
"generator": "lib/hyper-flux-8steps-lora.json"
},

"hyper-flux-16steps-lora": {
"generator": "lib/hyper-flux-16steps-lora.json"
},

"dev": {
"generator": "lib/flux.1-dev.json"
},
2 changes: 1 addition & 1 deletion config/lib/awportrait-lora.json
@@ -9,7 +9,7 @@
"weight_name": "AWPortrait-FL-lora.safetensors"
},
"options": {
"lora_scale": 1.0
"lora_scale": 1.0
}
}
]
35 changes: 35 additions & 0 deletions config/lib/hyper-flux-16steps-lora.json
@@ -0,0 +1,35 @@
{
"pipeline": {
"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
"torch_dtype": "bfloat16",
"Loras": [
{
"weights": {
"pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
"weight_name": "Hyper-FLUX.1-dev-16steps-lora.safetensors"
},
"options": {
"lora_scale": 0.125
}
}
]
},
"options": {
"enable_vae_slicing": true,
"enable_vae_tiling": true,
"to": {
"device": "cuda"
}
},
"generation_kwargs": {
"standard": {
"guidance_scale": 3.5,
"num_inference_steps": 16
},
"hd": {
"guidance_scale": 3.5,
"num_inference_steps": 25
}
}
}

35 changes: 35 additions & 0 deletions config/lib/hyper-flux-8steps-lora.json
@@ -0,0 +1,35 @@
{
"pipeline": {
"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
"torch_dtype": "bfloat16",
"Loras": [
{
"weights": {
"pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
"weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors"
},
"options": {
"lora_scale": 0.125
}
}
]
},
"options": {
"enable_vae_slicing": true,
"enable_vae_tiling": true,
"to": {
"device": "cuda"
}
},
"generation_kwargs": {
"standard": {
"guidance_scale": 3.5,
"num_inference_steps": 8
},
"hd": {
"guidance_scale": 3.5,
"num_inference_steps": 12
}
}
}
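
For reference, here is a rough, self-contained sketch of what the `hyper-flux-8steps-lora` config above amounts to when written directly against the `diffusers` API (the 16-steps variant differs only in the weight file and step counts). The real loading path is the JSON-driven code in `images.py`, so treat this as an approximation rather than the server's implementation.

```python
# Approximate equivalent of config/lib/hyper-flux-8steps-lora.json, written
# straight against diffusers; assumes a CUDA device with enough VRAM for FLUX.1-dev.
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)

# Fuse the Hyper-SD 8-step lora at the scale given in the config, then drop the
# lora weights so only the fused pipeline remains in memory.
pipe.load_lora_weights(
    "ByteDance/Hyper-SD", weight_name="Hyper-FLUX.1-dev-8steps-lora.safetensors"
)
pipe.fuse_lora(lora_scale=0.125)
pipe.unload_lora_weights()

pipe.enable_vae_slicing()
pipe.enable_vae_tiling()
pipe.to("cuda")

# "standard" generation_kwargs from the config
image = pipe(
    "a photograph of an astronaut riding a horse",
    guidance_scale=3.5,
    num_inference_steps=8,
    height=1024,
    width=1024,
).images[0]
image.save("hyper-flux-8step.png")
```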

15 changes: 11 additions & 4 deletions images.py
@@ -144,11 +144,15 @@ async def load_flux_model(config: dict) -> FluxPipeline:

# Loras
for lora in loras:
logger.info(f"Loading Lora: args: {lora['weight_name']}")

lora_weights = lora.pop('weights')

logger.info(f"Loading Lora: args: {lora_weights['weight_name']}")
flux_pipe.load_lora_weights(**lora_weights)
flux_pipe.fuse_lora(lora_scale=lora.pop('lora_scale', 1.0))
if 'options' in lora:
lora_scale=lora['options'].pop('lora_scale', 1.0)
else:
lora_scale=lora.pop('lora_scale', 1.0)
flux_pipe.fuse_lora(lora_scale=lora_scale)
flux_pipe.unload_lora_weights()

# This makes no noticeable difference for me, but YMMV
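
The `options` fallback added above means a lora entry may carry its `lora_scale` either inside an `options` block (as the new `config/lib` files do) or at the top level of the entry; the flat form shown here is inferred from the `else` branch and is only a hypothetical illustration.

```python
# Two entry shapes the loop above accepts; both resolve to lora_scale = 0.125.
lora_with_options = {
    "weights": {
        "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
        "weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors",
    },
    "options": {"lora_scale": 0.125},  # new style, as in config/lib/hyper-flux-8steps-lora.json
}

lora_flat = {
    "weights": {
        "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
        "weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors",
    },
    "lora_scale": 0.125,  # older flat style handled by the else branch
}
```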
@@ -350,7 +354,7 @@ def make_pngmetadata():

resp['data'].extend([img_dat])

logger.debug(f"Generated {len(images)} {request.model} image(s) in {int(time.time()) - resp['created']}s")
logger.debug(f"Generated {len(images)} {request.model} image(s) in {time.time() - resp['created'] / 1000:.1f}s")

return resp

@@ -400,6 +404,9 @@ def parse_args(argv=None):
torch._inductor.config.epilogue_fusion = False
torch._inductor.config.coordinate_descent_check_all_directions = True

# from hyperflux
torch.backends.cuda.matmul.allow_tf32 = True

if args.seed is not None:
random_seed = args.seed
