From daa7483a89f2325a1ad1fcc11a1c15f18c3b23cf Mon Sep 17 00:00:00 2001 From: matatonic Date: Sat, 14 Sep 2024 15:56:36 -0400 Subject: [PATCH] lora bug fix, +hyper-flux-8step & 16step. lora. --- README.md | 4 ++- config.default.json | 8 ++++++ config/lib/awportrait-lora.json | 2 +- config/lib/hyper-flux-16steps-lora.json | 35 +++++++++++++++++++++++++ config/lib/hyper-flux-8steps-lora.json | 35 +++++++++++++++++++++++++ images.py | 15 ++++++++--- 6 files changed, 93 insertions(+), 6 deletions(-) create mode 100644 config/lib/hyper-flux-16steps-lora.json create mode 100644 config/lib/hyper-flux-8steps-lora.json diff --git a/README.md b/README.md index 8a35d5c..18944ca 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ Low VRAM options (<4GB VRAM, 34GB RAM, `+enable_sequential_cpu_offload`, float16 There are `-compile` variants of many models as well. Be advised that the first couple images in a compiled model will be very slow to generate. The server must load, perhaps quantize and compile, and then the generation is dynamically optimized over the next couple generations, the first image may be 10 minutes or more to prepare. Most models can generate dozens of images in that time, so only use compiled models if you know what you're doing. -And more, including `int8` quants, check out the `config/lib` folder for more examples, including lora options. +And more, including `int8` quants, check out the `config/lib` folder for more examples, including lora options such as ByteDance `hyper-flux-8steps-lora`. > Timings are casually measured at 1024x1024 standard on an Nvidia A100 and may vary wildly from your system. @@ -185,7 +185,9 @@ And more, including `int8` quants, check out the `config/lib` folder for more ex ## Performance Performance plots for A100 (80GB) and 4090 (24GB), batch size = 1. Click Details to expand. +
+ Performance details for A100 & 4090 ![alt text](processing_time_A100.png) diff --git a/config.default.json b/config.default.json index fbe9360..7f27b7e 100644 --- a/config.default.json +++ b/config.default.json @@ -63,6 +63,14 @@ "generator": "lib/sayakpaul-flux.1-merged-low.json" }, + "hyper-flux-8steps-lora": { + "generator": "lib/hyper-flux-8steps-lora.json" + }, + + "hyper-flux-16steps-lora": { + "generator": "lib/hyper-flux-16steps-lora.json" + }, + "dev": { "generator": "lib/flux.1-dev.json" }, diff --git a/config/lib/awportrait-lora.json b/config/lib/awportrait-lora.json index 176273a..ea09f2d 100644 --- a/config/lib/awportrait-lora.json +++ b/config/lib/awportrait-lora.json @@ -9,7 +9,7 @@ "weight_name": "AWPortrait-FL-lora.safetensors" }, "options": { - "lora_scale": 1.0 + "lora_scale": 1.0 } } ] diff --git a/config/lib/hyper-flux-16steps-lora.json b/config/lib/hyper-flux-16steps-lora.json new file mode 100644 index 0000000..5c02174 --- /dev/null +++ b/config/lib/hyper-flux-16steps-lora.json @@ -0,0 +1,35 @@ +{ + "pipeline": { + "pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev", + "torch_dtype": "bfloat16", + "Loras": [ + { + "weights": { + "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD", + "weight_name": "Hyper-FLUX.1-dev-16steps-lora.safetensors" + }, + "options": { + "lora_scale": 0.125 + } + } + ] + }, + "options": { + "enable_vae_slicing": true, + "enable_vae_tiling": true, + "to": { + "device": "cuda" + } + }, + "generation_kwargs": { + "standard": { + "guidance_scale": 3.5, + "num_inference_steps": 16 + }, + "hd": { + "guidance_scale": 3.5, + "num_inference_steps": 25 + } + } +} + \ No newline at end of file diff --git a/config/lib/hyper-flux-8steps-lora.json b/config/lib/hyper-flux-8steps-lora.json new file mode 100644 index 0000000..04ade91 --- /dev/null +++ b/config/lib/hyper-flux-8steps-lora.json @@ -0,0 +1,35 @@ +{ + "pipeline": { + "pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev", + "torch_dtype": "bfloat16", + "Loras": [ + { + "weights": { + "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD", + "weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors" + }, + "options": { + "lora_scale": 0.125 + } + } + ] + }, + "options": { + "enable_vae_slicing": true, + "enable_vae_tiling": true, + "to": { + "device": "cuda" + } + }, + "generation_kwargs": { + "standard": { + "guidance_scale": 3.5, + "num_inference_steps": 8 + }, + "hd": { + "guidance_scale": 3.5, + "num_inference_steps": 12 + } + } +} + \ No newline at end of file diff --git a/images.py b/images.py index c5a12c6..b787b7e 100755 --- a/images.py +++ b/images.py @@ -144,11 +144,15 @@ async def load_flux_model(config: dict) -> FluxPipeline: # Loras for lora in loras: - logger.info(f"Loading Lora: args: {lora['weight_name']}") - lora_weights = lora.pop('weights') + + logger.info(f"Loading Lora: args: {lora_weights['weight_name']}") flux_pipe.load_lora_weights(**lora_weights) - flux_pipe.fuse_lora(lora_scale=lora.pop('lora_scale', 1.0)) + if 'options' in lora: + lora_scale=lora['options'].pop('lora_scale', 1.0) + else: + lora_scale=lora.pop('lora_scale', 1.0) + flux_pipe.fuse_lora(lora_scale=lora_scale) flux_pipe.unload_lora_weights() # This makes no noticeable difference for me, but YMMV @@ -350,7 +354,7 @@ def make_pngmetadata(): resp['data'].extend([img_dat]) - logger.debug(f"Generated {len(images)} {request.model} image(s) in {int(time.time()) - resp['created']}s") + logger.debug(f"Generated {len(images)} {request.model} image(s) in {time.time() - resp['created'] / 1000:.1f}s") return resp @@ -400,6 +404,9 @@ def parse_args(argv=None): torch._inductor.config.epilogue_fusion = False torch._inductor.config.coordinate_descent_check_all_directions = True + # from hyperflux + torch.backends.cuda.matmul.allow_tf32 = True + if args.seed is not None: random_seed = args.seed