From 9d480960015d3693c917c528f57150fa079f5194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilija=20Kalini=C4=87?= Date: Wed, 11 Dec 2024 15:40:26 +0100 Subject: [PATCH 1/3] Fix sd3-lite clip-l model swapped outputs --- examples/diffusion/python_stable_diffusion_3/txt2img.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/diffusion/python_stable_diffusion_3/txt2img.py b/examples/diffusion/python_stable_diffusion_3/txt2img.py index d079041f309..e050527aad4 100644 --- a/examples/diffusion/python_stable_diffusion_3/txt2img.py +++ b/examples/diffusion/python_stable_diffusion_3/txt2img.py @@ -485,6 +485,11 @@ def encode_token_weights(self, model_name, token_weight_pairs): def get_embeddings(self, prompt_tokens): l_out, l_pooled = self.encode_token_weights("clip-l", prompt_tokens["l"]) + if l_out.shape != (1, 77, 768): + tmp = l_pooled + l_pooled = l_out + l_out = tmp + g_out, g_pooled = self.encode_token_weights("clip-g", prompt_tokens["g"]) if not self.skip_t5: From 9a9a849629916a4011c7f4581833c5e38febd92f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilija=20Kalini=C4=87?= Date: Tue, 17 Dec 2024 09:14:31 +0100 Subject: [PATCH 2/3] Refactor and add comment --- examples/diffusion/python_stable_diffusion_3/txt2img.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/diffusion/python_stable_diffusion_3/txt2img.py b/examples/diffusion/python_stable_diffusion_3/txt2img.py index e050527aad4..c4fb86c45f9 100644 --- a/examples/diffusion/python_stable_diffusion_3/txt2img.py +++ b/examples/diffusion/python_stable_diffusion_3/txt2img.py @@ -485,10 +485,9 @@ def encode_token_weights(self, model_name, token_weight_pairs): def get_embeddings(self, prompt_tokens): l_out, l_pooled = self.encode_token_weights("clip-l", prompt_tokens["l"]) + # stable-diffusion-3-lite-onnx has swapped outputs for clip-l text encoder if l_out.shape != (1, 77, 768): - tmp = l_pooled - l_pooled = l_out - l_out = tmp + l_out, l_pooled = l_pooled, l_out g_out, g_pooled = self.encode_token_weights("clip-g", prompt_tokens["g"]) From 542805a50d1300394995d606fcca81010bf9cc1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilija=20Kalini=C4=87?= Date: Thu, 19 Dec 2024 09:50:22 +0100 Subject: [PATCH 3/3] Fix formatting --- examples/diffusion/python_stable_diffusion_3/txt2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/diffusion/python_stable_diffusion_3/txt2img.py b/examples/diffusion/python_stable_diffusion_3/txt2img.py index c4fb86c45f9..995f68f5d22 100644 --- a/examples/diffusion/python_stable_diffusion_3/txt2img.py +++ b/examples/diffusion/python_stable_diffusion_3/txt2img.py @@ -488,7 +488,7 @@ def get_embeddings(self, prompt_tokens): # stable-diffusion-3-lite-onnx has swapped outputs for clip-l text encoder if l_out.shape != (1, 77, 768): l_out, l_pooled = l_pooled, l_out - + g_out, g_pooled = self.encode_token_weights("clip-g", prompt_tokens["g"]) if not self.skip_t5: