From daa7483a89f2325a1ad1fcc11a1c15f18c3b23cf Mon Sep 17 00:00:00 2001
From: matatonic <matatonic-git@zhero.org>
Date: Sat, 14 Sep 2024 15:56:36 -0400
Subject: [PATCH] lora bug fix, +hyper-flux-8steps & 16steps lora.

---
 README.md                               |  4 ++-
 config.default.json                     |  8 ++++++
 config/lib/awportrait-lora.json         |  2 +-
 config/lib/hyper-flux-16steps-lora.json | 35 +++++++++++++++++++++++++
 config/lib/hyper-flux-8steps-lora.json  | 35 +++++++++++++++++++++++++
 images.py                               | 15 ++++++++---
 6 files changed, 93 insertions(+), 6 deletions(-)
 create mode 100644 config/lib/hyper-flux-16steps-lora.json
 create mode 100644 config/lib/hyper-flux-8steps-lora.json
diff --git a/README.md b/README.md
index 8a35d5c..18944ca 100644
--- a/README.md
+++ b/README.md
@@ -174,7 +174,7 @@ Low VRAM options (<4GB VRAM, 34GB RAM, `+enable_sequential_cpu_offload`, float16
 
 There are `-compile` variants of many models as well. Be advised that the first couple images in a compiled model will be very slow to generate. The server must load, perhaps quantize and compile, and then the generation is dynamically optimized over the next couple generations, the first image may be 10 minutes or more to prepare. Most models can generate dozens of images in that time, so only use compiled models if you know what you're doing.
 
-And more, including `int8` quants, check out the `config/lib` folder for more examples, including lora options.
+And more, including `int8` quants. Check out the `config/lib` folder for more examples, including lora options such as ByteDance's `hyper-flux-8steps-lora`.
 
 > Timings are casually measured at 1024x1024 standard on an Nvidia A100 and may vary wildly from your system.
 
@@ -185,7 +185,9 @@ And more, including `int8` quants, check out the `config/lib` folder for more ex
 ## Performance
 
 Performance plots for A100 (80GB) and 4090 (24GB), batch size = 1. Click Details to expand.
+
 <details>
+<summary> Performance details for A100 & 4090</summary>
 
 ![alt text](processing_time_A100.png)
 
diff --git a/config.default.json b/config.default.json
index fbe9360..7f27b7e 100644
--- a/config.default.json
+++ b/config.default.json
@@ -63,6 +63,14 @@
       "generator": "lib/sayakpaul-flux.1-merged-low.json"
     },
 
+    "hyper-flux-8steps-lora": {
+      "generator": "lib/hyper-flux-8steps-lora.json"
+    },
+
+    "hyper-flux-16steps-lora": {
+      "generator": "lib/hyper-flux-16steps-lora.json"
+    },
+
     "dev": {
       "generator": "lib/flux.1-dev.json"
     },
diff --git a/config/lib/awportrait-lora.json b/config/lib/awportrait-lora.json
index 176273a..ea09f2d 100644
--- a/config/lib/awportrait-lora.json
+++ b/config/lib/awportrait-lora.json
@@ -9,7 +9,7 @@
           "weight_name": "AWPortrait-FL-lora.safetensors"
         },
         "options": {
-            "lora_scale": 1.0
+          "lora_scale": 1.0
         }
       }
     ]
diff --git a/config/lib/hyper-flux-16steps-lora.json b/config/lib/hyper-flux-16steps-lora.json
new file mode 100644
index 0000000..5c02174
--- /dev/null
+++ b/config/lib/hyper-flux-16steps-lora.json
@@ -0,0 +1,35 @@
+{
+  "pipeline": {
+    "pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
+    "torch_dtype": "bfloat16",
+    "Loras": [
+      {
+        "weights": {
+          "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
+          "weight_name": "Hyper-FLUX.1-dev-16steps-lora.safetensors"
+        },
+        "options": {
+          "lora_scale": 0.125
+        }
+      }
+    ]
+  },
+  "options": {
+    "enable_vae_slicing": true,
+    "enable_vae_tiling": true,
+    "to": {
+      "device": "cuda"
+    }
+  },
+  "generation_kwargs": {
+    "standard": {
+      "guidance_scale": 3.5,
+      "num_inference_steps": 16
+    },
+    "hd": {
+      "guidance_scale": 3.5,
+      "num_inference_steps": 25
+    }
+  }
+}
+  
\ No newline at end of file
diff --git a/config/lib/hyper-flux-8steps-lora.json b/config/lib/hyper-flux-8steps-lora.json
new file mode 100644
index 0000000..04ade91
--- /dev/null
+++ b/config/lib/hyper-flux-8steps-lora.json
@@ -0,0 +1,35 @@
+{
+  "pipeline": {
+    "pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
+    "torch_dtype": "bfloat16",
+    "Loras": [
+      {
+        "weights": {
+          "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
+          "weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors"
+        },
+        "options": {
+          "lora_scale": 0.125
+        }
+      }
+    ]
+  },
+  "options": {
+    "enable_vae_slicing": true,
+    "enable_vae_tiling": true,
+    "to": {
+      "device": "cuda"
+    }
+  },
+  "generation_kwargs": {
+    "standard": {
+      "guidance_scale": 3.5,
+      "num_inference_steps": 8
+    },
+    "hd": {
+      "guidance_scale": 3.5,
+      "num_inference_steps": 12
+    }
+  }
+}
+  
\ No newline at end of file
diff --git a/images.py b/images.py
index c5a12c6..b787b7e 100755
--- a/images.py
+++ b/images.py
@@ -144,11 +144,15 @@ async def load_flux_model(config: dict) -> FluxPipeline:
 
     # Loras
     for lora in loras:
-        logger.info(f"Loading Lora: args: {lora['weight_name']}")
-
         lora_weights = lora.pop('weights')
+
+        logger.info(f"Loading Lora: args: {lora_weights['weight_name']}")
         flux_pipe.load_lora_weights(**lora_weights)
-        flux_pipe.fuse_lora(lora_scale=lora.pop('lora_scale', 1.0))
+        if 'options' in lora:
+            lora_scale=lora['options'].pop('lora_scale', 1.0)
+        else:
+            lora_scale=lora.pop('lora_scale', 1.0)
+        flux_pipe.fuse_lora(lora_scale=lora_scale)
         flux_pipe.unload_lora_weights()
 
     # This makes no noticeable difference for me, but YMMV
@@ -350,7 +354,7 @@ def make_pngmetadata():
 
                 resp['data'].extend([img_dat])
 
-        logger.debug(f"Generated {len(images)} {request.model} image(s) in {int(time.time()) - resp['created']}s")
+        logger.debug(f"Generated {len(images)} {request.model} image(s) in {time.time() - resp['created'] / 1000:.1f}s")
 
         return resp
 
@@ -400,6 +404,9 @@ def parse_args(argv=None):
     torch._inductor.config.epilogue_fusion = False
     torch._inductor.config.coordinate_descent_check_all_directions = True
 
+    # from hyperflux: allow TF32 for CUDA matmuls (faster on GPUs that support TF32, at slightly reduced precision)
+    torch.backends.cuda.matmul.allow_tf32 = True
+
     if args.seed is not None:
         random_seed = args.seed