lora bug fix; add hyper-flux-8steps-lora & hyper-flux-16steps-lora
matatonic committed Sep 14, 2024
1 parent 3758cc5 commit daa7483
Showing 6 changed files with 93 additions and 6 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -174,7 +174,7 @@ Low VRAM options (<4GB VRAM, 34GB RAM, `+enable_sequential_cpu_offload`, float16

There are `-compile` variants of many models as well. Be advised that the first couple of images from a compiled model will be very slow to generate: the server must load, and perhaps quantize and compile, the model, and generation is then dynamically optimized over the next couple of runs, so the first image may take 10 minutes or more to prepare. Most models can generate dozens of images in that time, so only use compiled models if you know what you're doing.

And more, including `int8` quants, check out the `config/lib` folder for more examples, including lora options.
And there's more, including `int8` quants: check out the `config/lib` folder for further examples, including lora options such as ByteDance's `hyper-flux-8steps-lora`.

> Timings are casually measured at 1024x1024 standard on an Nvidia A100 and may vary wildly on your system.
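
As a quick illustration, a new lora generator such as `hyper-flux-8steps-lora` can be requested like any other model through the OpenAI-compatible images endpoint. The sketch below is not from the repository: the base URL, port, and API key are placeholder assumptions to adjust for your own deployment.

```python
# Hypothetical client call -- base_url, port, and api_key are placeholders,
# not values defined by this repository.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5005/v1", api_key="sk-anything")

response = client.images.generate(
    model="hyper-flux-8steps-lora",  # generator name registered in config.default.json
    prompt="a photograph of an astronaut riding a horse",
    size="1024x1024",
    response_format="b64_json",
)
print(f"got {len(response.data)} image(s), created={response.created}")
```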
@@ -185,7 +185,9 @@ And more, including `int8` quants, check out the `config/lib` folder for more ex
## Performance

Performance plots for A100 (80GB) and 4090 (24GB), batch size = 1. Click Details to expand.

<details>
<summary> Performance details for A100 & 4090</summary>

![Processing time on A100](processing_time_A100.png)

8 changes: 8 additions & 0 deletions config.default.json
@@ -63,6 +63,14 @@
"generator": "lib/sayakpaul-flux.1-merged-low.json"
},

"hyper-flux-8steps-lora": {
"generator": "lib/hyper-flux-8steps-lora.json"
},

"hyper-flux-16steps-lora": {
"generator": "lib/hyper-flux-16steps-lora.json"
},

"dev": {
"generator": "lib/flux.1-dev.json"
},
2 changes: 1 addition & 1 deletion config/lib/awportrait-lora.json
@@ -9,7 +9,7 @@
"weight_name": "AWPortrait-FL-lora.safetensors"
},
"options": {
"lora_scale": 1.0
"lora_scale": 1.0
}
}
]
35 changes: 35 additions & 0 deletions config/lib/hyper-flux-16steps-lora.json
@@ -0,0 +1,35 @@
{
"pipeline": {
"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
"torch_dtype": "bfloat16",
"Loras": [
{
"weights": {
"pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
"weight_name": "Hyper-FLUX.1-dev-16steps-lora.safetensors"
},
"options": {
"lora_scale": 0.125
}
}
]
},
"options": {
"enable_vae_slicing": true,
"enable_vae_tiling": true,
"to": {
"device": "cuda"
}
},
"generation_kwargs": {
"standard": {
"guidance_scale": 3.5,
"num_inference_steps": 16
},
"hd": {
"guidance_scale": 3.5,
"num_inference_steps": 25
}
}
}

35 changes: 35 additions & 0 deletions config/lib/hyper-flux-8steps-lora.json
@@ -0,0 +1,35 @@
{
"pipeline": {
"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev",
"torch_dtype": "bfloat16",
"Loras": [
{
"weights": {
"pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
"weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors"
},
"options": {
"lora_scale": 0.125
}
}
]
},
"options": {
"enable_vae_slicing": true,
"enable_vae_tiling": true,
"to": {
"device": "cuda"
}
},
"generation_kwargs": {
"standard": {
"guidance_scale": 3.5,
"num_inference_steps": 8
},
"hd": {
"guidance_scale": 3.5,
"num_inference_steps": 12
}
}
}
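
For reference, here is a rough, self-contained sketch of what the `hyper-flux-8steps-lora` config above amounts to when written directly against the `diffusers` API (the 16-steps variant differs only in the weight file and step counts). The real loading path is the JSON-driven code in `images.py`, so treat this as an approximation rather than the server's implementation.

```python
# Approximate equivalent of config/lib/hyper-flux-8steps-lora.json, written
# straight against diffusers; assumes a CUDA device with enough VRAM for FLUX.1-dev.
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)

# Fuse the Hyper-SD 8-step lora at the scale given in the config, then drop the
# lora weights so only the fused pipeline remains in memory.
pipe.load_lora_weights(
    "ByteDance/Hyper-SD", weight_name="Hyper-FLUX.1-dev-8steps-lora.safetensors"
)
pipe.fuse_lora(lora_scale=0.125)
pipe.unload_lora_weights()

pipe.enable_vae_slicing()
pipe.enable_vae_tiling()
pipe.to("cuda")

# "standard" generation_kwargs from the config
image = pipe(
    "a photograph of an astronaut riding a horse",
    guidance_scale=3.5,
    num_inference_steps=8,
    height=1024,
    width=1024,
).images[0]
image.save("hyper-flux-8step.png")
```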

15 changes: 11 additions & 4 deletions images.py
@@ -144,11 +144,15 @@ async def load_flux_model(config: dict) -> FluxPipeline:

# Loras
for lora in loras:
logger.info(f"Loading Lora: args: {lora['weight_name']}")

lora_weights = lora.pop('weights')

logger.info(f"Loading Lora: args: {lora_weights['weight_name']}")
flux_pipe.load_lora_weights(**lora_weights)
flux_pipe.fuse_lora(lora_scale=lora.pop('lora_scale', 1.0))
if 'options' in lora:
lora_scale=lora['options'].pop('lora_scale', 1.0)
else:
lora_scale=lora.pop('lora_scale', 1.0)
flux_pipe.fuse_lora(lora_scale=lora_scale)
flux_pipe.unload_lora_weights()

# This makes no noticeable difference for me, but YMMV
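
The `options` fallback added above means a lora entry may carry its `lora_scale` either inside an `options` block (as the new `config/lib` files do) or at the top level of the entry; the flat form shown here is inferred from the `else` branch and is only a hypothetical illustration.

```python
# Two entry shapes the loop above accepts; both resolve to lora_scale = 0.125.
lora_with_options = {
    "weights": {
        "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
        "weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors",
    },
    "options": {"lora_scale": 0.125},  # new style, as in config/lib/hyper-flux-8steps-lora.json
}

lora_flat = {
    "weights": {
        "pretrained_model_name_or_path_or_dict": "ByteDance/Hyper-SD",
        "weight_name": "Hyper-FLUX.1-dev-8steps-lora.safetensors",
    },
    "lora_scale": 0.125,  # older flat style handled by the else branch
}
```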
@@ -350,7 +354,7 @@ def make_pngmetadata():

resp['data'].extend([img_dat])

logger.debug(f"Generated {len(images)} {request.model} image(s) in {int(time.time()) - resp['created']}s")
logger.debug(f"Generated {len(images)} {request.model} image(s) in {time.time() - resp['created'] / 1000:.1f}s")

return resp

@@ -400,6 +404,9 @@ def parse_args(argv=None):
torch._inductor.config.epilogue_fusion = False
torch._inductor.config.coordinate_descent_check_all_directions = True

# from hyperflux
torch.backends.cuda.matmul.allow_tf32 = True

if args.seed is not None:
random_seed = args.seed
