Improve accuracy of uqff auto device map #1084

Merged: 1 commit, Jan 22, 2025
12 changes: 6 additions & 6 deletions docs/UQFF/LAYOUT.md
@@ -1,6 +1,6 @@
# UQFF internal structure

The following describes the exact memory layout of HQFF tensors of version 0.1.0.
The following describes the exact memory layout of UQFF tensors of version 0.1.0.

## ToC
- [GGUF quantization](#gguf-quantization)
@@ -14,7 +14,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0

| ID | Element type | Endianness |
| -------- | -------- | -------- |
| HQFF version | u32 | little endian |
| UQFF version | u32 | little endian |
| ISQ type (0) | u8 | little endian |
| Tensor data length in bytes | u32 | little endian |
| Whether bias data is included (boolean) | u8 | little endian |
@@ -27,7 +27,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
## Unquantized layers
| ID | Element type | Endianness |
| -------- | -------- | -------- |
| HQFF version | u32 | little endian |
| UQFF version | u32 | little endian |
| ISQ type (1) | u8 | little endian |
| Whether bias data is included (boolean) | u8 | little endian |
| **Array** Weight tensor data, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors) |
@@ -36,7 +36,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
## FP8 layers
| ID | Element type | Endianness |
| -------- | -------- | -------- |
| HQFF version | u32 | little endian |
| UQFF version | u32 | little endian |
| ISQ type (1) | u8 | little endian |
| Whether bias data is included (boolean) | u8 | little endian |
| **Array** Weight tensor data, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors) |
@@ -49,7 +49,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
## HQQ quantization
| ID | Element type | Endianness |
| -------- | -------- | -------- |
| HQFF version | u32 | little endian |
| UQFF version | u32 | little endian |
| ISQ type (2) | u8 | little endian |
| Whether bias data is included (boolean) | u8 | little endian |
| **Array** Q weight, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors) |
@@ -67,7 +67,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
## FP8 layers
| ID | Element type | Endianness |
| -------- | -------- | -------- |
| HQFF version | u32 | little endian |
| UQFF version | u32 | little endian |
| ISQ type (3) | u8 | little endian |
| Whether bias data is included (boolean) | u8 | little endian |
| **Array** Weight tensor data, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors) |
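For context on the offset used in the code changes below: in every layout above, the ISQ type byte immediately follows the 4-byte little-endian UQFF version, so it always sits at byte offset 4. A minimal sketch of reading that shared prefix (illustrative only, not code from this PR; `read_uqff_header` is a hypothetical helper, and the `byteorder` crate is assumed, as in the serializers further down):

```rust
use byteorder::{LittleEndian, ReadBytesExt};
use std::io::Cursor;

/// The ISQ type byte follows the 4-byte little-endian UQFF version, i.e. it is
/// at offset 4 (the value `mistralrs_quant::UQFF_QUANT_TYPE_OFFSET` refers to).
const UQFF_QUANT_TYPE_OFFSET: usize = std::mem::size_of::<u32>();

/// Read the (version, ISQ type) prefix shared by every UQFF tensor layout above.
fn read_uqff_header(artifact: &[u8]) -> std::io::Result<(u32, u8)> {
    let mut cursor = Cursor::new(artifact);
    let version = cursor.read_u32::<LittleEndian>()?; // UQFF version, little endian
    // Same byte as artifact[UQFF_QUANT_TYPE_OFFSET]: 0 = GGUF, 1 = unquantized, 2 = HQQ, 3 = FP8.
    let isq_type = cursor.read_u8()?;
    Ok((version, isq_type))
}
```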
2 changes: 1 addition & 1 deletion mistralrs-core/src/pipeline/isq.rs
@@ -685,7 +685,7 @@ pub trait IsqModel {
if let Some(artifact) = artifact_isqs.get(&i) {
let artifact = artifact.data();
// NOTE(EricLBuehler): isq type is ALWAYS byte 4 (5th) of the tensor.
let isq_type = artifact[4];
let isq_type = artifact[mistralrs_quant::UQFF_QUANT_TYPE_OFFSET];
let deserialized = match QuantizedSerdeType::try_from(isq_type as usize)? {
QuantizedSerdeType::Gguf => {
GgufMatMul::deserialize(Cow::from(artifact), &devices[i])?
67 changes: 38 additions & 29 deletions mistralrs-core/src/pipeline/normal.rs
@@ -22,7 +22,7 @@ use crate::lora::Ordering;
use crate::paged_attention::{calculate_cache_config, AttentionImplementation, CacheEngine};
use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
use crate::pipeline::get_chat_template;
use crate::pipeline::isq::{UqffFullSer, UQFF_RESIDUAL_SAFETENSORS};
use crate::pipeline::isq::UqffFullSer;
use crate::pipeline::sampling::sample_and_add_toks;
use crate::pipeline::text_models_inputs_processor::make_prompt_chunk;
use crate::pipeline::{ChatTemplate, LocalModelPaths};
@@ -37,13 +37,14 @@ use crate::{
normal_model_loader, xlora_model_loader, DeviceMapSetting, PagedAttentionConfig, Pipeline,
Topology, TryIntoDType,
};
use anyhow::{Context, Result};
use anyhow::Result;
use candle_core::{Device, Tensor, Var};
use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
use mistralrs_quant::IsqType;
use mistralrs_quant::{GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
use rand_isaac::Isaac64Rng;
use regex_automata::meta::Regex;
use std::any::Any;
use std::borrow::Cow;
use std::fs;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
@@ -290,38 +291,46 @@ impl Loader for NormalLoader {
// Match logic below where UQFF has priority
let (layer_sizes_in_bytes, non_mapped_size_in_bytes, total_model_size_in_bytes) =
if let Some(serialized) = &*self.from_uqff.read().unwrap() {
let parent = serialized
.parent()
.context("Target UQFF path must have a filename!")?;
let residual = parent.join(UQFF_RESIDUAL_SAFETENSORS);

let ser_total_size = {
let weight_pack_factor = {
let ser_artifacts = unsafe {
candle_core::safetensors::MmapedSafetensors::new(serialized)?
};
ser_artifacts
.tensors()
.iter()
.map(|(_, t)| t.data().len())
.sum::<usize>()
};
let res_total_size = {
let res_artifacts =
unsafe { candle_core::safetensors::MmapedSafetensors::new(residual)? };
res_artifacts
.tensors()
.iter()
.map(|(_, t)| t.data().len())
.sum::<usize>()
let mut total_pack_factors = 0;
let total_tensors = ser_artifacts.tensors().len();
for (_, artifact) in ser_artifacts.tensors() {
let artifact = artifact.data();
// NOTE(EricLBuehler): isq type is ALWAYS byte 4 (5th) of the tensor.
let isq_type = artifact[mistralrs_quant::UQFF_QUANT_TYPE_OFFSET];
let pack_factor = match QuantizedSerdeType::try_from(isq_type as usize)?
{
QuantizedSerdeType::Hqq => {
HqqLayer::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
.pack_factor(dtype)
}
QuantizedSerdeType::Gguf => {
GgufMatMul::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
.pack_factor(dtype)
}
QuantizedSerdeType::Fp8 => IsqType::F8E4M3.pack_factor(dtype),
QuantizedSerdeType::Unquant => 1,
};
total_pack_factors += pack_factor;
}

total_pack_factors / total_tensors
};
let size_per_layer = ser_total_size / self.inner.num_layers(&config)?;

// This is not completely correct but hopefully close enough.
// For example, the norms are not necessarily correctly done.
let layer_sizes_in_bytes =
self.inner
.layer_sizes_in_bytes(&config, dtype, weight_pack_factor)?;
let non_mapped_size_in_bytes =
self.inner
.non_mapped_size_in_bytes(&config, dtype, weight_pack_factor)?;
let layer_sizes_sum = layer_sizes_in_bytes.iter().sum::<usize>();
(
vec![size_per_layer; self.inner.num_layers(&config)?],
res_total_size,
ser_total_size,
layer_sizes_in_bytes,
non_mapped_size_in_bytes,
layer_sizes_sum + non_mapped_size_in_bytes,
)
} else if let Some(isq) = in_situ_quant {
let weight_pack_factor = isq.pack_factor(dtype);
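To summarize the change above: instead of summing the on-disk bytes of the serialized UQFF file plus the residual safetensors and splitting them evenly across layers, the loader now peeks at each tensor's ISQ type, averages the resulting pack factors, and hands that average to the model's own `layer_sizes_in_bytes` / `non_mapped_size_in_bytes` estimators. A rough sketch of the resulting arithmetic, assuming (as in `layer_sizes_in_bytes`) that the pack factor acts as a divisor on the unquantized weight size; `params_in_layer` and `dtype_size_in_bytes` are hypothetical inputs, not names from this PR:

```rust
/// Rough per-layer size estimate once an average pack factor is known.
/// Illustrative only; the real estimate is computed inside `layer_sizes_in_bytes`.
fn estimated_layer_size_in_bytes(
    params_in_layer: usize,     // number of weight elements in the layer
    dtype_size_in_bytes: usize, // size of one unquantized element, e.g. 2 for f16
    avg_pack_factor: usize,     // averaged over all tensors in the UQFF file
) -> usize {
    (params_in_layer * dtype_size_in_bytes) / avg_pack_factor
}
```

Averaging the pack factors over every tensor is itself an approximation (the PR's comment notes that norms and similar small tensors are not handled exactly), but it tracks the in-memory footprint much more closely than dividing the serialized file size evenly across layers, which is what the removed code did.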
66 changes: 37 additions & 29 deletions mistralrs-core/src/pipeline/vision.rs
@@ -14,7 +14,6 @@ use super::{
use crate::device_map::{self, DeviceMapper};
use crate::paged_attention::{calculate_cache_config, AttentionImplementation, CacheEngine};
use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
use crate::pipeline::isq::UQFF_RESIDUAL_SAFETENSORS;
use crate::pipeline::llg::build_tok_env;
use crate::pipeline::sampling::sample_and_add_toks;
use crate::pipeline::text_models_inputs_processor::make_prompt_chunk;
@@ -32,13 +31,14 @@ use crate::{
AnyMoeExpertType, DeviceMapSetting, Ordering, PagedAttentionConfig, Pipeline, Topology,
TryIntoDType,
};
use anyhow::{Context, Result};
use anyhow::Result;
use candle_core::{Device, Tensor, Var};
use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
use mistralrs_quant::IsqType;
use mistralrs_quant::{GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
use rand_isaac::Isaac64Rng;
use regex_automata::meta::Regex;
use std::any::Any;
use std::borrow::Cow;
use std::fs;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
@@ -219,38 +219,46 @@ impl Loader for VisionLoader {
// Match logic below where UQFF has priority
let (layer_sizes_in_bytes, non_mapped_size_in_bytes, total_model_size_in_bytes) =
if let Some(serialized) = &*self.from_uqff.read().unwrap() {
let parent = serialized
.parent()
.context("Target UQFF path must have a filename!")?;
let residual = parent.join(UQFF_RESIDUAL_SAFETENSORS);

let ser_total_size = {
let weight_pack_factor = {
let ser_artifacts = unsafe {
candle_core::safetensors::MmapedSafetensors::new(serialized)?
};
ser_artifacts
.tensors()
.iter()
.map(|(_, t)| t.data().len())
.sum::<usize>()
};
let res_total_size = {
let res_artifacts =
unsafe { candle_core::safetensors::MmapedSafetensors::new(residual)? };
res_artifacts
.tensors()
.iter()
.map(|(_, t)| t.data().len())
.sum::<usize>()
let mut total_pack_factors = 0;
let total_tensors = ser_artifacts.tensors().len();
for (_, artifact) in ser_artifacts.tensors() {
let artifact = artifact.data();
// NOTE(EricLBuehler): isq type is ALWAYS byte 4 (5th) of the tensor.
let isq_type = artifact[mistralrs_quant::UQFF_QUANT_TYPE_OFFSET];
let pack_factor = match QuantizedSerdeType::try_from(isq_type as usize)?
{
QuantizedSerdeType::Hqq => {
HqqLayer::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
.pack_factor(dtype)
}
QuantizedSerdeType::Gguf => {
GgufMatMul::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
.pack_factor(dtype)
}
QuantizedSerdeType::Fp8 => IsqType::F8E4M3.pack_factor(dtype),
QuantizedSerdeType::Unquant => 1,
};
total_pack_factors += pack_factor;
}

total_pack_factors / total_tensors
};
let size_per_layer = ser_total_size / self.inner.num_layers(&config)?;

// This is not completely correct but hopefully close enough.
// For example, the norms are not necessarily correctly done.
let layer_sizes_in_bytes =
self.inner
.layer_sizes_in_bytes(&config, dtype, weight_pack_factor)?;
let non_mapped_size_in_bytes =
self.inner
.non_mapped_size_in_bytes(&config, dtype, weight_pack_factor)?;
let layer_sizes_sum = layer_sizes_in_bytes.iter().sum::<usize>();
(
vec![size_per_layer; self.inner.num_layers(&config)?],
res_total_size,
ser_total_size + res_total_size,
layer_sizes_in_bytes,
non_mapped_size_in_bytes,
layer_sizes_sum + non_mapped_size_in_bytes,
)
} else if let Some(isq) = in_situ_quant {
let weight_pack_factor = isq.pack_factor(dtype);
7 changes: 4 additions & 3 deletions mistralrs-quant/src/fp8/mod.rs
@@ -16,7 +16,7 @@ use crate::{
cublaslt::{maybe_init_cublas_lt_wrapper, F8MatmulOutType, CUBLASLT_HANDLE},
utils::{
deserialize_tensor, read_dtype, serialize_tensor, version_is_compatible, write_dtype,
HQFF_VERSION,
UQFF_VERSION,
},
DummyLayer, IsqType, QuantMethod, QuantMethodConfig, QuantizedConfig, QuantizedSerde,
QuantizedSerdeType, UnquantLinear,
@@ -189,7 +189,7 @@ impl QuantMethod for FP8Linear {
// Serialization structure:
//
// -----------------------
// HQFF version, u32, little endian
// UQFF version, u32, little endian
// -----------------------
// ISQ type (3 for fp8), u8, little endian
// -----------------------
@@ -218,7 +218,8 @@ impl QuantizedSerde for FP8Linear {
fn serialize(&self) -> Result<Cow<[u8]>> {
let mut buffer = Vec::new();

buffer.extend(&HQFF_VERSION.to_le_bytes());
// Version is always first!
buffer.extend(&UQFF_VERSION.to_le_bytes());

// ISQ type for fp8 is 3
buffer.push(QuantizedSerdeType::Fp8 as u8);
56 changes: 52 additions & 4 deletions mistralrs-quant/src/gguf/mod.rs
@@ -14,7 +14,7 @@ use candle_nn::Module;

use crate::{
generate_isq, generate_isq_imatrix,
utils::{deserialize_tensor, serialize_tensor, version_is_compatible, HQFF_VERSION},
utils::{deserialize_tensor, serialize_tensor, version_is_compatible, UQFF_VERSION},
IsqType, QuantMethod, QuantMethodConfig, QuantizedSerde, QuantizedSerdeType,
};

@@ -161,7 +161,7 @@ impl QuantMethod for GgufMatMul {
// Serialization structure:
//
// -----------------------
// HQFF version, u32, little endian
// UQFF version, u32, little endian
// -----------------------
// ISQ type (0 for GGUF), u8, little endian
// -----------------------
@@ -217,7 +217,8 @@ impl QuantizedSerde for GgufMatMul {

let mut buffer = Vec::new();

buffer.extend(&HQFF_VERSION.to_le_bytes());
// Version is always first!
buffer.extend(&UQFF_VERSION.to_le_bytes());

// ISQ type for GGUF is 0
buffer.push(QuantizedSerdeType::Gguf as u8);
Expand Down Expand Up @@ -255,7 +256,7 @@ impl QuantizedSerde for GgufMatMul {
}

fn deserialize(data: Cow<[u8]>, device: &Device) -> Result<Arc<dyn QuantMethod>> {
let mut buffer = Cursor::new(data.to_vec());
let mut buffer = Cursor::new(data);

let version = buffer.read_u32::<LittleEndian>()?;
if let Err(e) = version_is_compatible(version) {
@@ -274,6 +275,7 @@

let has_bias = buffer.read_u8()? != 0;

// TODO: keep this in sync with get_isq_type_from_uqff!
let dtype = buffer.read_u32::<LittleEndian>()?;
let dtype = match dtype {
0 => GgmlDType::F32,
@@ -319,3 +321,49 @@
}))
}
}

impl GgufMatMul {
pub fn get_isq_type_from_uqff(data: Cow<[u8]>) -> Result<IsqType> {
let mut buffer = Cursor::new(data);

let version = buffer.read_u32::<LittleEndian>()?;
if let Err(e) = version_is_compatible(version) {
return Err(candle_core::Error::wrap(e));
}

let isq_type = buffer.read_u8()? as usize;
if isq_type != QuantizedSerdeType::Gguf as usize {
candle_core::bail!(
"ISQ type ({isq_type}) doesn't match expected type {}",
QuantizedSerdeType::Gguf as usize
);
}

let _ = buffer.read_u32::<LittleEndian>()? as usize;

let _ = buffer.read_u8()? != 0;

let dtype = buffer.read_u32::<LittleEndian>()?;
let dtype = match dtype {
0 => GgmlDType::F32,
1 => GgmlDType::F16,
2 => GgmlDType::Q4_0,
3 => GgmlDType::Q4_1,
6 => GgmlDType::Q5_0,
7 => GgmlDType::Q5_1,
8 => GgmlDType::Q8_0,
9 => GgmlDType::Q8_1,
10 => GgmlDType::Q2K,
11 => GgmlDType::Q3K,
12 => GgmlDType::Q4K,
13 => GgmlDType::Q5K,
14 => GgmlDType::Q6K,
15 => GgmlDType::Q8K,
// https://github.com/ggerganov/ggml/blob/29d87fc6676e7ed0cdfdec0804b06001d9c2bb44/include/ggml.h#L389
30 => GgmlDType::BF16,
_ => candle_core::bail!("unknown dtype for quantized weight tensor {dtype}"),
};

IsqType::try_from(dtype)
}
}
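The TODO above ("keep this in sync with get_isq_type_from_uqff!") exists because the dtype-code match is now duplicated between `deserialize` and the new `get_isq_type_from_uqff`. One possible follow-up, not part of this PR, would be to factor the mapping into a shared helper that both paths call; the sketch below assumes the same imports as the file above, and the dtype codes are copied directly from the match in the diff:

```rust
/// Hypothetical shared decoder for the GGUF dtype codes written by `serialize`.
fn ggml_dtype_from_uqff_code(code: u32) -> candle_core::Result<GgmlDType> {
    Ok(match code {
        0 => GgmlDType::F32,
        1 => GgmlDType::F16,
        2 => GgmlDType::Q4_0,
        3 => GgmlDType::Q4_1,
        6 => GgmlDType::Q5_0,
        7 => GgmlDType::Q5_1,
        8 => GgmlDType::Q8_0,
        9 => GgmlDType::Q8_1,
        10 => GgmlDType::Q2K,
        11 => GgmlDType::Q3K,
        12 => GgmlDType::Q4K,
        13 => GgmlDType::Q5K,
        14 => GgmlDType::Q6K,
        15 => GgmlDType::Q8K,
        // https://github.com/ggerganov/ggml/blob/29d87fc6676e7ed0cdfdec0804b06001d9c2bb44/include/ggml.h#L389
        30 => GgmlDType::BF16,
        other => candle_core::bail!("unknown dtype for quantized weight tensor {other}"),
    })
}
```

With such a helper, both `deserialize` and `get_isq_type_from_uqff` would stay in sync by construction rather than by convention.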